This commit is contained in:
hyginn 2017-11-03 15:08:17 -04:00
parent 1d6170f417
commit 67abde8920

View File

@ -29,18 +29,18 @@
#TOC> #TOC>
#TOC> Section Title Line #TOC> Section Title Line
#TOC> --------------------------------------------------------- #TOC> ---------------------------------------------------------
#TOC> 1 The Biostrings package 57 #TOC> 1 The Biostrings Package 52
#TOC> 2 Getting Data into Biostrings Objects 91 #TOC> 2 Getting Data into Biostrings Objects 85
#TOC> 3 Working with Biostrings Objects 111 #TOC> 3 Working with Biostrings Objects 106
#TOC> 3.1 Properties 114 #TOC> 3.1 Properties 109
#TOC> 3.2 Subsetting 151 #TOC> 3.2 Subsetting 146
#TOC> 3.3 Operators 163 #TOC> 3.3 Operators 158
#TOC> 3.4 Transformations 170 #TOC> 3.4 Transformations 165
#TOC> 4 Getting Data out of Biostrings Objects 177 #TOC> 4 Getting Data out of Biostrings Objects 172
#TOC> 5 More 186 #TOC> 5 More 181
#TOC> 5.1 Views 188 #TOC> 5.1 Views 183
#TOC> 5.2 Iranges 200 #TOC> 5.2 Iranges 195
#TOC> 5.3 StringSets 206 #TOC> 5.3 StringSets 201
#TOC> #TOC>
#TOC> ========================================================================== #TOC> ==========================================================================
@ -49,7 +49,7 @@
# be using more of the biostrings functions. # be using more of the biostrings functions.
# = 1 The Biostrings package ============================================== # = 1 The Biostrings Package ==============================================
# First, we install and load the Biostrings package from bioconductor # First, we install and load the Biostrings package from bioconductor
@ -62,13 +62,12 @@ if (! require(Biostrings, quietly=TRUE)) {
library(Biostrings) library(Biostrings)
} }
# Examine the ackage information: # Examine the package information:
library(help = Biostrings) # basic information library(help = Biostrings) # basic information
browseVignettes("Biostrings") # available vignettes browseVignettes("Biostrings") # available vignettes
data(package = "Biostrings") # available datasets data(package = "Biostrings") # available datasets
# At its core, Biostrings objects are "classes" of type XString (you can think # At its core, Biostrings objects are "classes" of type XString (you can think
# of a "class" in R as a special kind of list), that can take on particular # of a "class" in R as a special kind of list), that can take on particular
# flavours for RNA, DNA or amino acid sequence information. # flavours for RNA, DNA or amino acid sequence information.
@ -89,17 +88,18 @@ DNAString("AUG") # Error! No "U" in IUPAC DNA codes
# Example: read FASTA. Extract sequence. Convert to DNAString object. # Example: read FASTA. Extract sequence. Convert to DNAString object.
x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
x <- dbSanitizeSequence(x) x <- dbSanitizeSequence(x)
myDNAseq <- DNAString(x) # takes the nucleotide sequence and conerts into a myDNAseq <- DNAString(x) # takes the nucleotide sequence and converts into a
# object of class DNAstring # object of class DNAstring
# Multi FASTA files can be read directly ... # Multi FASTA files can be read directly as a "XStringSet) ...
readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa") # Note: XStringSet (myDNASet <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa"))
# ... and if you subset one sequence from the set, you get an XString object # ... and if you subset one sequence from the set, you get an XString object
( x <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa")[[1]] ) # back again.
(Xseq <- myDNASet[[1]])
myDNAseq == x myDNAseq == Xseq # the comparison evaluates to TRUE ...
identical(myDNAseq, x) identical(myDNAseq, Xseq) # ... and indeed the objects are deemed identical.
@ -108,7 +108,7 @@ identical(myDNAseq, x)
# == 3.1 Properties ======================================================== # == 3.1 Properties ========================================================
str(myDNAseq) str(myDNAseq)
length(nchar(x)) # This gives you the _number of nucleotides_! length(myDNAseq) # This gives you the _number of nucleotides_!
# By comparison ... # By comparison ...
length(x) # ... is 1: one string only. To get the number of length(x) # ... is 1: one string only. To get the number of
# characters in a string, you need nchar(). # characters in a string, you need nchar().
@ -130,15 +130,15 @@ sum(letterFrequency(myDNAseq, c("G", "C"))) / length(myDNAseq) # GC contents
dinucleotideFrequency(myDNAseq) dinucleotideFrequency(myDNAseq)
barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5) barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5)
(x <- trinucleotideFrequency(myDNAseq)) (triNuc <- trinucleotideFrequency(myDNAseq))
barplot(sort(x), col="#4499EE33") barplot(sort(triNuc), col="#4499EE33")
x[x == max(x)] triNuc[triNuc == max(triNuc)]
x[x == min(x)] triNuc[triNuc == min(triNuc)]
max(x) / min(x) # AAA is more than 13 times as frequent as CGT max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
# compare to a shuffled sequence: # compare to a shuffled sequence:
(x <- trinucleotideFrequency(sample(myDNAseq))) (triNuc <- trinucleotideFrequency(sample(myDNAseq)))
barplot(sort(x), col="#EEEE4433", add = TRUE) barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
# Interpret this plot. # Interpret this plot.