# RPR-Biostrings.R
#
# Purpose:  A Bioinformatics Course:
#              R code accompanying the RPR-Biostrings unit.
#
# Version:  1.0
#
# Date:     2017  10  20
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
#           1.0    2017 Revisions
#           0.1    First code copied from 2016 material.
#
#
# TODO:
#
#
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================


#TOC> ==========================================================================
#TOC>
#TOC>   Section  Title                                         Line
#TOC> ---------------------------------------------------------
#TOC>   1        The Biostrings Package                          52
#TOC>   2        Getting Data into Biostrings Objects            85
#TOC>   3        Working with Biostrings Objects                106
#TOC>   3.1      Properties                                     109
#TOC>   3.2      Subsetting                                     146
#TOC>   3.3      Operators                                      158
#TOC>   3.4      Transformations                                165
#TOC>   4        Getting Data out of Biostrings Objects         172
#TOC>   5        More                                           181
#TOC>   5.1      Views                                          183
#TOC>   5.2      Iranges                                        195
#TOC>   5.3      StringSets                                     201
#TOC>
#TOC> ==========================================================================


# This is a very brief introduction to the Biostrings package, other units will
# be using more of the Biostrings functions.


# =    1  The Biostrings Package  ==============================================


# First, we install (if necessary) and load the Biostrings package from
# Bioconductor. BiocManager replaces the retired biocLite.R installer script.
if (! requireNamespace("Biostrings", quietly = TRUE)) {
  if (! requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("Biostrings")
}
library(Biostrings)

# Examine the package information:
library(help = Biostrings)       # basic information
browseVignettes("Biostrings")    # available vignettes
data(package = "Biostrings")     # available datasets


# At its core, Biostrings objects are "classes" of type XString (you can think
# of a "class" in R as a special kind of list), that can take on particular
# flavours for RNA, DNA or amino acid sequence information.

class(RNAString("AUG"))
class(DNAString("ATG"))
class(AAString("M"))

# An essential property of Biostrings objects is that they only allow letters
# from the applicable IUPAC alphabet:
RNAString("AUG")
DNAString("AUG")  # Error! No "U" in IUPAC DNA codes


# =    2  Getting Data into Biostrings Objects  ================================


# Example: read FASTA. Extract sequence. Convert to DNAString object.
x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
x <- dbSanitizeSequence(x)  # not defined in this script - presumably a course
                            # utility that must already be loaded (TODO confirm)
myDNAseq <- DNAString(x)    # takes the nucleotide sequence and converts it
                            # into an object of class DNAString
# Multi FASTA files can be read directly as an "XStringSet" ...
(myDNASet <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa"))

# ... and if you subset one sequence from the set, you get an XString object
# back again.
(Xseq <- myDNASet[[1]])

myDNAseq == Xseq           # the comparison evaluates to TRUE ...
identical(myDNAseq, Xseq)  # ... and indeed the objects are deemed identical.



# =    3  Working with Biostrings Objects  =====================================


# ==   3.1  Properties  ========================================================
str(myDNAseq)
length(myDNAseq)  # This gives you the _number of nucleotides_!
                  # By comparison ...
length(x)         # ... is 1: one string only. To get the number of
                  # characters in a string, you need nchar().
nchar(x)          # However ...
nchar(myDNAseq)   # ... also works.
uniqueLetters(myDNAseq)

# Count frequencies - with strings, you would strsplit() into a character
# vector and then use table(). Biostrings provides alphabetFrequency() instead:
alphabetFrequency(myDNAseq)

# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
# returns.
letterFrequency(myDNAseq, uniqueLetters(myDNAseq))

sum(letterFrequency(myDNAseq, c("G", "C"))) / length(myDNAseq) # GC contents

dinucleotideFrequency(myDNAseq)
barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5)

(triNuc <- trinucleotideFrequency(myDNAseq))
barplot(sort(triNuc), col="#4499EE33")
triNuc[triNuc == max(triNuc)]
triNuc[triNuc == min(triNuc)]
max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT

# compare to a shuffled sequence (note: this overwrites triNuc, and the bars
# are drawn semi-transparently on top of the previous plot):
(triNuc <- trinucleotideFrequency(sample(myDNAseq)))
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)

# Interpret this plot.


# ==   3.2  Subsetting  ========================================================

# Subsetting any XString object works as expected:
myDNAseq[4:15]

# ... well - maybe not expected, because x[4:15] would not work.

# Alternatively to the "[" operator, use the subseq() function - especially for
# long sequences. This is far more efficient.
subseq(myDNAseq, start = 1, end = 30)


# ==   3.3  Operators  =========================================================

# RNAString() and DNAString() objects compare U and T as equals!
RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
  DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")


# ==   3.4  Transformations  ===================================================

myDNAseq[4:15]
reverseComplement(myDNAseq[4:15])
translate(myDNAseq[4:15])


# =    4  Getting Data out of Biostrings Objects  ==============================

# If you need a character object, use toString():

toString(myDNAseq[4:15])

# save() and load() work like on all other R objects.
# =    5  More  ================================================================

# ==   5.1  Views  =============================================================

# Biostring "Views" are objects that store multiple substrings of one
# Biostring object.

(myView <- Views(myDNAseq, start = c(1, 19, 37), end = c(15, 30, 45)))

# Views are convenient to store feature annotations
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
# Convert the views to plain character strings explicitly, so that sprintf()
# recycles over one string per view.
cat(sprintf("\n%s\t(%d)\t%s",
            names(myView),
            width(myView),
            as.character(myView)))


# ==   5.2  Iranges  ===========================================================

# Biostrings Iranges are like Views with a common start point. These can be
# useful for feature annotations. Instead of start/end you store start/width.


# ==   5.3  StringSets  ========================================================

# Biostring "StringSets" store multiple sequences.
#
ompA <- AAString("MKKTAIAIAVALAGFATVAQA")
sample(ompA)  # sample can work directly on a Biostring object to shuffle it

# Start from a fresh character vector: x was used for the DNA sequence string
# earlier in this script, and any leftover elements would otherwise end up in
# the peptide set.
x <- character(10)
x[1] <- toString(ompA)
for (i in 2:10) {
  x[i] <- toString(sample(ompA))
}
shuffledPeptideSet <- AAStringSet(x)
names(shuffledPeptideSet) <- c("ompA", paste0("shuffle.", 1:9))
shuffledPeptideSet

length(shuffledPeptideSet)
width(shuffledPeptideSet)
alphabetFrequency(shuffledPeptideSet)


# [END]