# ABC_makeYFOlist.R
#
# Purpose:  Create a list of genome sequenced fungi with protein annotations and
#           Mbp1 homologues.
#
# Version:  1.1
#
# Date:     2016 09 - 2017 08
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.1     Update 2017
# V 1.0     First code 2016
#
# TODO:
#    actually rerun for 2017
#    type out workflow
#
# ==============================================================================

# DO NOT source() THIS FILE!

# This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand.

# Run the commands that interact with the NCBI servers only if you want to
# experiment with the code and/or parameters. I have commented out those
# parts. If you simply want to reproduce the process you can simply
# load() the respective intermediate results.

# ==============================================================================
#        CREATING A YFO LIST
# ==============================================================================

# This script will create a list of "YFO" species and save it in an R object
# YFOspecies that is stored in the data subdirectory of this project from where
# it can be loaded.

# ==== GOLD species ============================================================
#
# Fetch and parse genome data from the NCBI genome project database

# === Initialize
# httr provides interfaces to Webservers on the Internet.
# requireNamespace() only checks whether the package is installed; the
# package is then attached unconditionally with library(), which stops with an
# error (rather than returning FALSE) if the installation did not succeed.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)

# The URL where the genome data can be downloaded
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"

# Read the data directly from the NCBI ftp server and put it into a dataframe.
# This will take about a minute.
# GOLDdata <- read.csv(URL,
#                      header = TRUE,
#                      sep = "\t",
#                      stringsAsFactors = FALSE)
# save(GOLDdata, file="data/GOLDdata.RData")

load(file="data/GOLDdata.RData")

# What columns does the table have, how is it structured?
str(GOLDdata)

# What groups of organisms are in the table? How many of each?
table(GOLDdata$Group)

# What subgroups of fungi do we have?
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])

# How many of the fungi have protein annotations?
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")

# Get a subset of the data, with fungi that have protein annotations
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" & GOLDdata$Proteins != "-", ]

# check what we have in the table
head(GOLDfungi)

# For our purpose of defining species, we pick only the first two words from the
# "X.Organism.Name" column ... here is a function to do this:
#
makeBinomial <- function(s) {
  # input:
  #   s: a string which is expected to contain a binomial
  #      species name as the first two words, followed by other text
  # output:
  #   the first two words separated by a single blank. If s contains fewer
  #   than two words, only the words that are present are returned.
  #
  # Leading/trailing whitespace is removed first: otherwise splitting would
  # produce an empty first element and the result would lose the species word.
  words <- unlist(strsplit(trimws(s), "\\s+"))  # split on one or more
                                                # whitespace characters
  # Take at most two words; indexing past the end of the vector would
  # introduce NA and paste() would turn it into the literal text "NA".
  paste(words[seq_len(min(2L, length(words)))], collapse = " ")
}

# iterate through GOLDfungi and extract species names. vapply() returns a
# character vector with one element per row and a zero-length vector if
# GOLDfungi has no rows.
GOLDspecies <- vapply(GOLDfungi$X.Organism.Name,
                      makeBinomial,
                      character(1),
                      USE.NAMES = FALSE)

# Species of great interest may appear more than once, one for each sequenced
# strain: e.g. brewer's yeast:
sum(GOLDspecies == "Saccharomyces cerevisiae")

# Therefore we use the function unique() to throw out duplicates. Simple:
GOLDspecies <- unique(GOLDspecies)

length(GOLDspecies)
# i.e. we got rid of about half of the species.

# ==== BLAST species ===========================================================
#
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
# contain them.
#
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw
# together to make a BLAST interface for the course is in the file BLAST.R
# Feel encouraged to study how this works.

source("BLAST.R")   # load the function and its utilities

# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# hits <- BLAST("NP_010227",                # Yeast Mbp1 RefSeq ID
#               nHits = 1000,               # 633 hits in 2016
#               E = 0.01,                   #
#               limits = "txid4751[ORGN]")  # = fungi
# save(hits, file="data/BLASThits.RData")

load(file="data/BLASThits.RData")

# This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains.

# How many hits in the list?
length(hits$hits)

# Let's look at the first one. (The element is spelled out in full as
# hits$hits: an abbreviated name only works through partial matching of "$".)
str(hits$hits[[1]])

# the species information is in the $def element - the definition line of the
# sequence record ... but we need a function to retrieve it. This one is a great
# example, because it is really messy. Have a look:
hits$hits[[1]]$def

# We get so many species because the exact same hit has been found by BLAST in a
# number of RefSeqs sequence records, all of which are different strains of
# yeast. We only need one of those though, any one, so we shall parse out the
# first one. For this, we will simply use the immensely versatile strsplit()
# function, split on square brackets, take the second element of the resulting
# array and run that through our makeBinomial function.

# define the function (i.e.
# execute the lines below)
parseDeflineSpecies <- function(s) {
  # input:
  #   s: a string which is expected to contain a binomial
  #      species name in square brackets embedded in other text
  # output:
  #   the species name found in the first bracketed string
  #
  pieces <- unlist(strsplit(s, "\\]|\\["))  # split on "]" or "[" characters
  # The text before the first "[" is element 1, so the contents of the first
  # pair of brackets is element 2.
  makeBinomial(pieces[2])
}

# test it
parseDeflineSpecies(hits$hits[[1]]$def)
parseDeflineSpecies(hits$hits[[11]]$def)
parseDeflineSpecies(hits$hits[[111]]$def)

# now extract all the species names into a vector. vapply() returns a
# character vector with one element per hit and a zero-length vector if
# there are no hits.
BLASTspecies <- vapply(hits$hits,
                       function(hit) parseDeflineSpecies(hit$def),
                       character(1),
                       USE.NAMES = FALSE)

# You can confirm in the Values section of the Environment pane that
# BLASTspecies has the expected size. Again, species may appear more than once,
# e.g.
sum(BLASTspecies == "Saccharomyces cerevisiae")

# Therefore we use the function unique() to throw out duplicates. Simple:
BLASTspecies <- unique(BLASTspecies)

length(BLASTspecies)
# i.e. we got rid of about one third of the species.
#
# You should think about this: what does it mean that on average we have three
# hits by sequence similarity to Mbp1 in other species?


# ==== Intersecting BLAST and GOLD species lists ===============================

# Now we can compare the two lists for species that appear in both sources: the
# simplest way is to use the set operation functions union(), intersect()
# etc. See here:
?union

YFOspecies <- intersect(GOLDspecies, BLASTspecies)

# Just one final thing: some of the species will be our so-called "reference"
# species for which I will develop model solutions. I have defined them
# separately in the .utilities.R file to make them available for future
# purposes. Here we remove them from the list.

# REFspecies

YFOspecies <- sort(setdiff(YFOspecies, REFspecies))

# save(YFOspecies, file = "data/YFOspecies.RData")


# [END]