2017-10-04 03:38:48 +00:00
|
|
|
# ABC_makeMYSPElist.R
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
|
|
|
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
|
|
|
# Mbp1 homologues.
|
|
|
|
#
|
2017-10-04 03:38:48 +00:00
|
|
|
# Version: 1.1.1
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2017-09-21 21:49:55 +00:00
|
|
|
# Date: 2016 09 - 2017 09
|
2017-09-12 20:09:20 +00:00
|
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
|
|
#
|
|
|
|
# V 1.1 Update 2017
|
|
|
|
# V 1.0 First code 2016
|
|
|
|
#
|
|
|
|
# TODO:
|
2017-09-21 21:49:55 +00:00
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# type out workflow
|
|
|
|
#
|
|
|
|
# ==============================================================================
|
2017-09-21 21:49:55 +00:00
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# DO NOT source() THIS FILE!
|
2017-09-21 21:49:55 +00:00
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# This file is code I provide for your deeper understanding of a process and
|
|
|
|
# to provide you with useful sample code. It is not actually necessary for
|
|
|
|
# you to run this code, but I encourage you to read it carefully and discuss
|
|
|
|
# if there are parts you don't understand.
|
2017-09-21 21:49:55 +00:00
|
|
|
#
|
2017-09-12 20:09:20 +00:00
|
|
|
# Run the commands that interact with the NCBI servers only if you want to
|
2017-09-21 21:49:55 +00:00
|
|
|
# experiment specifically with the code and/or parameters. I have commented out
|
|
|
|
# those parts. If you only want to study the general workflow, just load()
|
|
|
|
# the respective intermediate results.
|
|
|
|
#
|
2017-10-04 03:38:48 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
#TOC> ==========================================================================
|
2017-10-04 03:38:48 +00:00
|
|
|
#TOC>
|
2017-09-21 21:49:55 +00:00
|
|
|
#TOC> Section Title Line
|
|
|
|
#TOC> ---------------------------------------------------
|
|
|
|
#TOC> 1 The strategy 54
|
|
|
|
#TOC> 2 GOLD species 66
|
|
|
|
#TOC> 2.1 Initialize 71
|
|
|
|
#TOC> 2.2 Import 77
|
|
|
|
#TOC> 2.3 Unique species 129
|
|
|
|
#TOC> 3 BLAST species 171
|
|
|
|
#TOC> 3.1 find homologous proteins 178
|
|
|
|
#TOC> 3.2 Identify species in "hits" 202
|
|
|
|
#TOC> 4 Intersect GOLD and BLAST species 247
|
|
|
|
#TOC> 5 Cleanup and finish 265
|
2017-10-04 03:38:48 +00:00
|
|
|
#TOC>
|
2017-09-21 21:49:55 +00:00
|
|
|
#TOC> ==========================================================================
|
2017-10-04 03:38:48 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
|
|
|
|
#TOC>
|
|
|
|
#TOC>
|
|
|
|
|
|
|
|
# = 1 The strategy ========================================================
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-10-04 03:38:48 +00:00
|
|
|
# This script will create a list of "MYSPE" species and save it in an R object
|
|
|
|
# MYSPEspecies that is stored in the data subdirectory of this project from where
|
2017-09-21 21:49:55 +00:00
|
|
|
# it can be loaded. The strategy is as follows: we download a list of all
|
|
|
|
# genome projects and then select species for which protein annotations are
|
|
|
|
# available - i.e. these are all genome-sequenced species that have been
|
|
|
|
# annotated. Then we search for fungal species that have homologues to MBP1.
|
|
|
|
# Then we intersect the two lists to give us genome-sequenced species that
|
|
|
|
# also have Mbp1 homologues ...
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# = 2 GOLD species ========================================================
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# Fetch and parse the Genomes OnLine Database of the Joint Genome Institute
|
|
|
|
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
|
|
|
|
|
|
|
|
# == 2.1 Initialize ========================================================
|
2017-09-12 20:09:20 +00:00
|
|
|
if (!require(httr)) { # httr provides interfaces to Webservers on the Internet
|
|
|
|
install.packages("httr")
|
|
|
|
library(httr)
|
|
|
|
}
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# == 2.2 Import ============================================================
|
|
|
|
|
|
|
|
# The URL of the genome data directory at the NCBI:
|
|
|
|
# is https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS
|
|
|
|
# Note the relative size of the prokaryotes and the eukaryotes data.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# What's in this directory?
|
|
|
|
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/README"
|
|
|
|
GOLDreadme <- readLines(URL) # read the file into a vector
|
|
|
|
cat(GOLDreadme, sep = "\n") # display the contents
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# Retrieve the file "eukaryotes" via ftp from the NCBI ftp server and put it
|
|
|
|
# into a dataframe. This will take a few moments.
|
|
|
|
# URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
|
2017-09-12 20:09:20 +00:00
|
|
|
# GOLDdata <- read.csv(URL,
|
|
|
|
# header = TRUE,
|
|
|
|
# sep = "\t",
|
|
|
|
# stringsAsFactors = FALSE)
|
|
|
|
# save(GOLDdata, file="data/GOLDdata.RData")
|
2017-09-21 21:49:55 +00:00
|
|
|
# or ...
|
2017-09-12 20:09:20 +00:00
|
|
|
load(file="data/GOLDdata.RData")
|
|
|
|
|
|
|
|
|
|
|
|
# What columns does the table have, how is it structured?
|
|
|
|
str(GOLDdata)
|
|
|
|
|
|
|
|
# What groups of organisms are in the table? How many of each?
|
|
|
|
table(GOLDdata$Group)
|
|
|
|
|
|
|
|
# What subgroups of fungi do we have?
|
|
|
|
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# How many of the fungi have protein annotations? The README file told us that
|
|
|
|
# the column "Proteins" contains "Number of Proteins annotated in the assembly".
|
|
|
|
# Looking at a few ...
|
|
|
|
head(GOLDdata$Proteins, 30)
|
|
|
|
# ... we see that the number varies, and some have a hyphen, i.e. no
|
|
|
|
# annotations. The hyphens make this a char type column (as per: all elements
|
|
|
|
# of a vector must have the same type). Therefore we can't read this as numbers
|
|
|
|
# and filter by some value > 0. But we can filter for all genomes that don't
|
|
|
|
# have the hyphen:
|
2017-09-12 20:09:20 +00:00
|
|
|
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# Subset the data, with fungi that have protein annotations
|
2017-09-12 20:09:20 +00:00
|
|
|
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
|
|
|
|
GOLDdata$Proteins != "-" , ]
|
|
|
|
|
|
|
|
# check what we have in the table
|
2017-09-21 21:49:55 +00:00
|
|
|
nrow(GOLDfungi)
|
2017-09-12 20:09:20 +00:00
|
|
|
head(GOLDfungi)
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
|
|
|
|
# == 2.3 Unique species ====================================================
|
|
|
|
|
|
|
|
# For our purpose of defining species, we will select only species, not strains
|
|
|
|
# from this list. To do this, we pick the first two words i.e. the systematic
|
|
|
|
# binomial name from the "X.Organism.Name" column, and then we remove redundant
|
|
|
|
# species. Here is a function:
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
getBinom <- function(s) {
|
|
|
|
# Fetch the first two words from a string.
|
|
|
|
# Parameters:
|
|
|
|
# s: char a string which is expected to contain a binomial species name
|
|
|
|
# as the first two words, possibly followed by other text.
|
|
|
|
# Value: char the first two words separated by a single blank
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2017-09-21 21:49:55 +00:00
|
|
|
x <- unlist(strsplit(s, "\\s+")) # split s on one or more whitespace
|
2017-09-12 20:09:20 +00:00
|
|
|
return(paste(x[1:2], collapse=" ")) # return first two elements
|
|
|
|
}
|
|
|
|
|
|
|
|
# iterate through GOLDdata and extract species names
|
|
|
|
GOLDspecies <- character()
|
|
|
|
for (i in 1:nrow(GOLDfungi)) {
|
2017-09-21 21:49:55 +00:00
|
|
|
GOLDspecies[i] <- getBinom(GOLDfungi$X.Organism.Name[i])
|
2017-09-12 20:09:20 +00:00
|
|
|
}
|
2017-09-21 21:49:55 +00:00
|
|
|
head(GOLDspecies)
|
|
|
|
length(GOLDspecies)
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# N.b. this would be more efficiently (but perhaps less explicitly) coded with
|
|
|
|
# one of the apply() functions, instead of a for-loop.
|
|
|
|
# GOLDspecies <- unlist(lapply(GOLDfungi$X.Organism.Name, getBinom))
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# Species of great interest may appear more than once, one for each sequenced
|
|
|
|
# strain: e.g. brewer's yeast:
|
2017-09-12 20:09:20 +00:00
|
|
|
sum(GOLDspecies == "Saccharomyces cerevisiae")
|
|
|
|
|
|
|
|
# Therefore we use the function unique() to throw out duplicates. Simple:
|
|
|
|
GOLDspecies <- unique(GOLDspecies)
|
|
|
|
|
|
|
|
length(GOLDspecies)
|
2017-09-21 21:49:55 +00:00
|
|
|
# i.e. we got rid of about 40% of the species by removing duplicates.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# = 3 BLAST species =======================================================
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2017-09-21 21:49:55 +00:00
|
|
|
# Next, we filter our list by species that have homologues to the yeast Mbp1
|
|
|
|
# gene. To do this we run a BLAST search to find all related proteins in any
|
|
|
|
# fungus. We list the species that appear in that list, and then we select those
|
|
|
|
# that appear in our GOLD table as well.
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
2017-09-21 21:49:55 +00:00
|
|
|
# == 3.1 find homologous proteins ==========================================
|
|
|
|
#
|
|
|
|
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
|
|
|
|
# contain them.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
|
|
|
|
# amount of error handling involved that is not supported by the API in a
|
|
|
|
# principled way but requires rather ad hoc solutions. The code I threw together
|
|
|
|
# to make a BLAST interface (demo-quality, not research-quality) is in the file
|
|
|
|
# BLAST.R Feel encouraged to study how this works. It's a pretty standard task
|
|
|
|
# of communicating with servers and parsing responses - everyday fare in the
|
|
|
|
# bioinformatics lab. Surprisingly, there seems to be no good BLAST parser
|
|
|
|
# in currently available packages.
|
|
|
|
|
|
|
|
# source("BLAST.R") # load the function and its utilities
|
2017-09-12 20:09:20 +00:00
|
|
|
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
2017-09-21 21:49:55 +00:00
|
|
|
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
|
|
|
# db = "refseq_protein", # database to search in
|
|
|
|
# nHits = 3000, # 720 hits in 2017
|
|
|
|
# E = 0.01, #
|
|
|
|
# limits = "txid4751[ORGN]") # = fungi
|
|
|
|
# save(BLASThits, file="data/BLASThits.RData")
|
2017-09-12 20:09:20 +00:00
|
|
|
load(file="data/BLASThits.RData")
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# == 3.2 Identify species in "hits" ========================================
|
|
|
|
|
2017-09-12 20:09:20 +00:00
|
|
|
# This is a very big list that can't be usefully analyzed manually. Here
|
|
|
|
# we are only interested in the species names that it contains.
|
|
|
|
|
|
|
|
# How many hits in the list?
|
2017-09-21 21:49:55 +00:00
|
|
|
length(BLASThits$hits)
|
|
|
|
|
|
|
|
# Let's look at a hit somewhere down the list
|
|
|
|
str(BLASThits$hit[[277]])
|
|
|
|
|
|
|
|
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
|
|
|
|
# in a useful way. The species information is in the $species element of every
|
|
|
|
# hit.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# Run a loop to extract all the species names into a vector. We subset ...
|
|
|
|
# Blasthits$hits ... the list of hits, from which we choose ...
|
|
|
|
# Blasthits$hits[[i]] ... the i-th hit, and get ...
|
|
|
|
# Blasthits$hits[[i]]$species ... the species element from that.
|
|
|
|
# Subsetting FTW.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
BLASTspecies <- character()
|
2017-09-21 21:49:55 +00:00
|
|
|
for (i in seq_along(BLASThits$hits)) {
|
|
|
|
BLASTspecies[i] <-BLASThits$hits[[i]]$species
|
2017-09-12 20:09:20 +00:00
|
|
|
}
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# You can confirm that BLASTspecies has the expected size.
|
|
|
|
length(BLASTspecies)
|
|
|
|
|
|
|
|
# Again, some species appear more than once, e.g. ...
|
2017-09-12 20:09:20 +00:00
|
|
|
sum(BLASTspecies == "Saccharomyces cerevisiae")
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
|
|
|
|
|
|
|
|
# Therefore we use unique() to throw out duplicates:
|
2017-09-12 20:09:20 +00:00
|
|
|
BLASTspecies <- unique(BLASTspecies)
|
|
|
|
|
|
|
|
length(BLASTspecies)
|
2017-09-21 21:49:55 +00:00
|
|
|
# i.e. we got rid of about two thirds of the hits.
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# You should think about this: what is the biological interpretation of the
|
|
|
|
# finding that on average we have three sequences that are similar to Mbp1 in
|
|
|
|
# other species?
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# = 4 Intersect GOLD and BLAST species ====================================
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
# Now we can compare the two lists for species that appear in both sources: the
|
|
|
|
# simplest way is to use the set operation functions union(), intersection()
|
|
|
|
# etc. See here:
|
|
|
|
?union
|
|
|
|
|
2017-10-04 03:38:48 +00:00
|
|
|
MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
# Again: interpret this:
|
|
|
|
# - what is the number of GOLDspecies?
|
|
|
|
# - what is the number of BLAST species?
|
|
|
|
# - how many species are present in both lists?
|
|
|
|
# - what does it mean if a species is in GOLD but not in the BLAST list?
|
|
|
|
# - what does it mean if a species has been found during BLAST, but it
|
|
|
|
# is not in GOLD?
|
|
|
|
|
|
|
|
|
|
|
|
# = 5 Cleanup and finish ==================================================
|
|
|
|
|
|
|
|
# One final thing: some of the species will be our so-called "reference" species
|
|
|
|
# which we use for model solutions and examples in the course. They are defined
|
|
|
|
# in the .utilities.R file of this project. We remove them from the list so that
|
|
|
|
# we don't inadvertently assign them.
|
2017-09-12 20:09:20 +00:00
|
|
|
#
|
|
|
|
|
|
|
|
REFspecies
|
|
|
|
|
2017-10-04 03:38:48 +00:00
|
|
|
MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
|
2017-09-12 20:09:20 +00:00
|
|
|
|
2017-10-04 03:38:48 +00:00
|
|
|
# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
|
2017-09-12 20:09:20 +00:00
|
|
|
|
|
|
|
|
2017-09-21 21:49:55 +00:00
|
|
|
|
|
|
|
|
2017-09-12 20:09:20 +00:00
|
|
|
# [END]
|