bch441-work-abc-units/ABC-makeYFOlist.R
2017-09-12 16:09:20 -04:00

221 lines
7.6 KiB
R

# ABC_makeYFOlist.R
#
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.1
#
# Date: 2016 09 - 2017 08
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.1 Update 2017
# V 1.0 First code 2016
#
# TODO:
# actually rerun for 2017
# type out workflow
#
# ==============================================================================
# DO NOT source() THIS FILE!
# This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand.
# Run the commands that interact with the NCBI servers only if you want to
# experiment with the code and/or parameters. I have commented out those
# parts. If you simply want to reproduce the process you can simply
# load() the respective intermediate results.
# ==============================================================================
# CREATING A YFO LIST
# ==============================================================================
# This script will create a list of "YFO" species and save it in an R object
# YFOspecies that is stored in the data subdirectory of this project from where
# it can be loaded.
# ==== GOLD species ============================================================
#
# Fetch and parse genome data from the NCBI genome project database
# === Initialize
if (!require(httr)) { # httr provides interfaces to Webservers on the Internet
install.packages("httr")
library(httr)
}
# The URL where the genome data can be downloaded
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
# Read the data directly from the NCBI ftp server and put it into a dataframe.
# This will take about a minute.
# GOLDdata <- read.csv(URL,
# header = TRUE,
# sep = "\t",
# stringsAsFactors = FALSE)
# save(GOLDdata, file="data/GOLDdata.RData")
load(file="data/GOLDdata.RData")
# What columns does the table have, how is it structured?
str(GOLDdata)
# What groups of organisms are in the table? How many of each?
table(GOLDdata$Group)
# What subgroups of fungi do we have?
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
# How many of the fungi have protein annotations?
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
# Get a subset of the data, with fungi that have protein annotations
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
GOLDdata$Proteins != "-" , ]
# check what we have in the table
head(GOLDfungi)
# For our purpose of defining species, we pick only the first two words from the
# "X.Organism.Name" column ... here is a function to do this:
#
makeBinomial <- function(s) {
# input:
# s: a string which is expected to contain a binomial
# species name as the first two words, followed by other text
# output:
# the first two words separeted by a single blank
#
x <- unlist(strsplit(s, "\\s+")) # split second element on
# one or more whitespace
return(paste(x[1:2], collapse=" ")) # return first two elements
}
# iterate through GOLDdata and extract species names
GOLDspecies <- character()
for (i in 1:nrow(GOLDfungi)) {
GOLDspecies[i] <- makeBinomial(GOLDfungi$X.Organism.Name[i])
}
# Species of great interest may may appear more than once, one for each sequenced strain: e.g. brewer's yeast:
sum(GOLDspecies == "Saccharomyces cerevisiae")
# Therefore we use the function unique() to throw out duplicates. Simple:
GOLDspecies <- unique(GOLDspecies)
length(GOLDspecies)
# i.e. we got rid of about half of the species.
# ==== BLAST species ===========================================================
#
# Use BLAST to fetch proteins related to Mbp1 and identifying the species that
# contain them.
#
# Scripting agains NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw
# together to make a BLAST interface for the course is in the file BLAST.R
# Feel encouraged to study how this works.
source("BLAST.R") # load the function and its utilites
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# hits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# nHits = 1000, # 633 hits in 2016
# E = 0.01, #
# limits = "txid4751[ORGN]") # = fungi
# save(hits, file="data/BLASThits.RData")
load(file="data/BLASThits.RData")
# This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains.
# How many hits in the list?
length(hits$hits)
# Let's look at the first one
str(hits$hit[[1]])
# the species information is in the $def element - the definition line of the
# sequence record ... but we need a function to retrieve it. This one is a great
# example, because it is really messy. Have a look:
hits$hits[[1]]$def
# We get so many species because the exact same hit has been found by BLAST in a
# number of RefSeqs sequence records, all of which are different strains of
# yeast. We only need one of those though, any one, so we shall parse out the
# first one. For this, We will simply use the immensely versatile strsplit()
# function, split on square brackets, take the second element of the resulting
# array and run that through our makeBinomial function.
# define the function (i.e. execute the lines below)
parseDeflineSpecies <- function(s) {
# input:
# s: a string which is expected to contain a binomial
# species name in square brackets embedded in other text
# output:
# the species name found the first bracketed string
#
x <- unlist(strsplit(s, "\\]|\\[")) # split on "]" or "[" characters
return(makeBinomial(x[2])) # return Binomial name from
# second element of vector
}
#test it
parseDeflineSpecies(hits$hits[[1]]$def)
parseDeflineSpecies(hits$hits[[11]]$def)
parseDeflineSpecies(hits$hits[[111]]$def)
# now run a simple loop to extract all the species names into a vector
BLASTspecies <- character()
for (i in 1:length(hits$hits)) {
BLASTspecies[i] <- parseDeflineSpecies(hits$hits[[i]]$def)
}
# You can confirm in the Values section of the Environment pane that
# BLASTspecies has the expected size. Again, species may appear more than once,
# e.g.
sum(BLASTspecies == "Saccharomyces cerevisiae")
# Therefore we use the function unique() to throw out duplicates. Simple:
BLASTspecies <- unique(BLASTspecies)
length(BLASTspecies)
# i.e. we got rid of about one third of the species.
#
# You should think about this: what does it mean that on average we have three
# hits by sequence similarity to Mbp1 in other species?
# ==== Intersecting BLAST and GOLD species lists ===============================
# Now we can compare the two lists for species that appear in both sources: the
# simplest way is to use the set operation functions union(), intersection()
# etc. See here:
?union
YFOspecies <- intersect(GOLDspecies, BLASTspecies)
# Just one final thing: some of the species will be our so-called "reference" species for which I will develop model solutions. I have defined them in the .utilities.R file to make them available for future purposes. separately and remove them from the list.
#
REFspecies
YFOspecies <- sort(setdiff(YFOspecies, REFspecies))
# save(YFOspecies, file = "data/YFOspecies.RData")
# [END]