221 lines
7.6 KiB
R
221 lines
7.6 KiB
R
|
# ABC_makeYFOlist.R
|
||
|
#
|
||
|
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||
|
# Mbp1 homologues.
|
||
|
#
|
||
|
# Version: 1.1
|
||
|
#
|
||
|
# Date: 2016 09 - 2017 08
|
||
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||
|
#
|
||
|
# V 1.1 Update 2017
|
||
|
# V 1.0 First code 2016
|
||
|
#
|
||
|
# TODO:
|
||
|
# actually rerun for 2017
|
||
|
# type out workflow
|
||
|
#
|
||
|
# ==============================================================================
|
||
|
|
||
|
# DO NOT source() THIS FILE!
|
||
|
|
||
|
# This file is code I provide for your deeper understanding of a process and
|
||
|
# to provide you with useful sample code. It is not actually necessary for
|
||
|
# you to run this code, but I encourage you to read it carefully and discuss
|
||
|
# if there are parts you don't understand.
|
||
|
|
||
|
# Run the commands that interact with the NCBI servers only if you want to
|
||
|
# experiment with the code and/or parameters. I have commented out those
|
||
|
# parts. If you simply want to reproduce the process you can simply
|
||
|
# load() the respective intermediate results.
|
||
|
|
||
|
|
||
|
# ==============================================================================
|
||
|
# CREATING A YFO LIST
|
||
|
# ==============================================================================
|
||
|
|
||
|
# This script will create a list of "YFO" species and save it in an R object
|
||
|
# YFOspecies that is stored in the data subdirectory of this project from where
|
||
|
# it can be loaded.
|
||
|
|
||
|
|
||
|
# ==== GOLD species ============================================================
|
||
|
#
|
||
|
# Fetch and parse genome data from the NCBI genome project database
|
||
|
|
||
|
# === Initialize
|
||
|
if (!require(httr)) { # httr provides interfaces to Webservers on the Internet
|
||
|
install.packages("httr")
|
||
|
library(httr)
|
||
|
}
|
||
|
|
||
|
# The URL where the genome data can be downloaded
|
||
|
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
|
||
|
|
||
|
# Read the data directly from the NCBI ftp server and put it into a dataframe.
|
||
|
# This will take about a minute.
|
||
|
|
||
|
# GOLDdata <- read.csv(URL,
|
||
|
# header = TRUE,
|
||
|
# sep = "\t",
|
||
|
# stringsAsFactors = FALSE)
|
||
|
# save(GOLDdata, file="data/GOLDdata.RData")
|
||
|
load(file="data/GOLDdata.RData")
|
||
|
|
||
|
|
||
|
# What columns does the table have, how is it structured?
|
||
|
str(GOLDdata)
|
||
|
|
||
|
# What groups of organisms are in the table? How many of each?
|
||
|
table(GOLDdata$Group)
|
||
|
|
||
|
# What subgroups of fungi do we have?
|
||
|
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
|
||
|
|
||
|
# How many of the fungi have protein annotations?
|
||
|
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
|
||
|
|
||
|
# Get a subset of the data, with fungi that have protein annotations
|
||
|
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
|
||
|
GOLDdata$Proteins != "-" , ]
|
||
|
|
||
|
# check what we have in the table
|
||
|
head(GOLDfungi)
|
||
|
|
||
|
# For our purpose of defining species, we pick only the first two words from the
|
||
|
# "X.Organism.Name" column ... here is a function to do this:
|
||
|
#
|
||
|
|
||
|
makeBinomial <- function(s) {
|
||
|
# input:
|
||
|
# s: a string which is expected to contain a binomial
|
||
|
# species name as the first two words, followed by other text
|
||
|
# output:
|
||
|
# the first two words separeted by a single blank
|
||
|
#
|
||
|
x <- unlist(strsplit(s, "\\s+")) # split second element on
|
||
|
# one or more whitespace
|
||
|
return(paste(x[1:2], collapse=" ")) # return first two elements
|
||
|
}
|
||
|
|
||
|
# iterate through GOLDdata and extract species names
|
||
|
GOLDspecies <- character()
|
||
|
for (i in 1:nrow(GOLDfungi)) {
|
||
|
GOLDspecies[i] <- makeBinomial(GOLDfungi$X.Organism.Name[i])
|
||
|
}
|
||
|
|
||
|
|
||
|
# Species of great interest may may appear more than once, one for each sequenced strain: e.g. brewer's yeast:
|
||
|
sum(GOLDspecies == "Saccharomyces cerevisiae")
|
||
|
|
||
|
# Therefore we use the function unique() to throw out duplicates. Simple:
|
||
|
GOLDspecies <- unique(GOLDspecies)
|
||
|
|
||
|
length(GOLDspecies)
|
||
|
# i.e. we got rid of about half of the species.
|
||
|
|
||
|
|
||
|
|
||
|
# ==== BLAST species ===========================================================
|
||
|
#
|
||
|
# Use BLAST to fetch proteins related to Mbp1 and identifying the species that
|
||
|
# contain them.
|
||
|
#
|
||
|
# Scripting agains NCBI APIs is not exactly enjoyable - there is usually a fair
|
||
|
# amount of error handling involved that is not supported by the API in a
|
||
|
# principled way but requires rather ad hoc solutions. The code I threw
|
||
|
# together to make a BLAST interface for the course is in the file BLAST.R
|
||
|
# Feel encouraged to study how this works.
|
||
|
|
||
|
source("BLAST.R") # load the function and its utilites
|
||
|
|
||
|
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
||
|
# hits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
||
|
# nHits = 1000, # 633 hits in 2016
|
||
|
# E = 0.01, #
|
||
|
# limits = "txid4751[ORGN]") # = fungi
|
||
|
# save(hits, file="data/BLASThits.RData")
|
||
|
load(file="data/BLASThits.RData")
|
||
|
|
||
|
# This is a very big list that can't be usefully analyzed manually. Here
|
||
|
# we are only interested in the species names that it contains.
|
||
|
|
||
|
# How many hits in the list?
|
||
|
length(hits$hits)
|
||
|
|
||
|
# Let's look at the first one
|
||
|
str(hits$hit[[1]])
|
||
|
|
||
|
# the species information is in the $def element - the definition line of the
|
||
|
# sequence record ... but we need a function to retrieve it. This one is a great
|
||
|
# example, because it is really messy. Have a look:
|
||
|
hits$hits[[1]]$def
|
||
|
|
||
|
# We get so many species because the exact same hit has been found by BLAST in a
|
||
|
# number of RefSeqs sequence records, all of which are different strains of
|
||
|
# yeast. We only need one of those though, any one, so we shall parse out the
|
||
|
# first one. For this, We will simply use the immensely versatile strsplit()
|
||
|
# function, split on square brackets, take the second element of the resulting
|
||
|
# array and run that through our makeBinomial function.
|
||
|
|
||
|
# define the function (i.e. execute the lines below)
|
||
|
parseDeflineSpecies <- function(s) {
|
||
|
# input:
|
||
|
# s: a string which is expected to contain a binomial
|
||
|
# species name in square brackets embedded in other text
|
||
|
# output:
|
||
|
# the species name found the first bracketed string
|
||
|
#
|
||
|
x <- unlist(strsplit(s, "\\]|\\[")) # split on "]" or "[" characters
|
||
|
return(makeBinomial(x[2])) # return Binomial name from
|
||
|
# second element of vector
|
||
|
}
|
||
|
|
||
|
#test it
|
||
|
parseDeflineSpecies(hits$hits[[1]]$def)
|
||
|
parseDeflineSpecies(hits$hits[[11]]$def)
|
||
|
parseDeflineSpecies(hits$hits[[111]]$def)
|
||
|
|
||
|
# now run a simple loop to extract all the species names into a vector
|
||
|
BLASTspecies <- character()
|
||
|
for (i in 1:length(hits$hits)) {
|
||
|
BLASTspecies[i] <- parseDeflineSpecies(hits$hits[[i]]$def)
|
||
|
}
|
||
|
|
||
|
# You can confirm in the Values section of the Environment pane that
|
||
|
# BLASTspecies has the expected size. Again, species may appear more than once,
|
||
|
# e.g.
|
||
|
sum(BLASTspecies == "Saccharomyces cerevisiae")
|
||
|
|
||
|
# Therefore we use the function unique() to throw out duplicates. Simple:
|
||
|
BLASTspecies <- unique(BLASTspecies)
|
||
|
|
||
|
length(BLASTspecies)
|
||
|
# i.e. we got rid of about one third of the species.
|
||
|
#
|
||
|
# You should think about this: what does it mean that on average we have three
|
||
|
# hits by sequence similarity to Mbp1 in other species?
|
||
|
|
||
|
|
||
|
# ==== Intersecting BLAST and GOLD species lists ===============================
|
||
|
|
||
|
|
||
|
# Now we can compare the two lists for species that appear in both sources: the
|
||
|
# simplest way is to use the set operation functions union(), intersection()
|
||
|
# etc. See here:
|
||
|
?union
|
||
|
|
||
|
YFOspecies <- intersect(GOLDspecies, BLASTspecies)
|
||
|
|
||
|
# Just one final thing: some of the species will be our so-called "reference" species for which I will develop model solutions. I have defined them in the .utilities.R file to make them available for future purposes. separately and remove them from the list.
|
||
|
#
|
||
|
|
||
|
REFspecies
|
||
|
|
||
|
YFOspecies <- sort(setdiff(YFOspecies, REFspecies))
|
||
|
|
||
|
# save(YFOspecies, file = "data/YFOspecies.RData")
|
||
|
|
||
|
|
||
|
# [END]
|