From 553b042f2094788c5e7c9ed4774141d7143cc42b Mon Sep 17 00:00:00 2001 From: hyginn Date: Sat, 28 Oct 2017 23:09:21 -0400 Subject: [PATCH] MSA unit, and supporting files --- .utilities.R | 7 +- BIN-ALI-MSA.R | 675 +++++++++++++++++++++++++++++++++-------- scripts/ABC-writeALN.R | 135 +++++++++ scripts/ABC-writeMFA.R | 115 +++++++ 4 files changed, 803 insertions(+), 129 deletions(-) create mode 100644 scripts/ABC-writeALN.R create mode 100644 scripts/ABC-writeMFA.R diff --git a/.utilities.R b/.utilities.R index ad19221..4339672 100644 --- a/.utilities.R +++ b/.utilities.R @@ -2,10 +2,11 @@ # # Miscellaneous R code to suppport the project # -# Version: 1.2 -# Date: 2017 09 +# Version: 1.3 +# Date: 2017 09 - 2017 10 # Author: Boris Steipe # +# V 1.3 load msa support functions # V 1.2 update database utilities to support 2017 version of JSON sources # V 1.1 2017 updates for ABC-units # V 1.0 First code @@ -18,6 +19,8 @@ # ====== SCRIPTS ============================================================= source("./scripts/ABC-dbUtilities.R") +source("./scripts/ABC-writeALN.R") +source("./scripts/ABC-writeMFA.R") # ====== SUPPORT FUNCTIONS ===================================================== diff --git a/BIN-ALI-MSA.R b/BIN-ALI-MSA.R index 8f1a28b..5723148 100644 --- a/BIN-ALI-MSA.R +++ b/BIN-ALI-MSA.R @@ -3,94 +3,148 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-MSA unit. # -# Version: 0.1 +# Version: 1.0 # -# Date: 2017 08 28 +# Date: 2017 10 23 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.0 Fully refactored and rewritten for 2017 # 0.1 First code copied from 2016 material. - +# # # TODO: # # # == DO NOT SIMPLY source() THIS FILE! ======================================= - +# # If there are portions you don't understand, use R's help system, Google for an # answer, or ask your instructor. Don't continue if you don't understand what's # going on. That's not how it works ... - +# # ============================================================================== -# = 1 Multiple sequence alignment -# We will compute a multiple sequence alignment using the "muscle" algorithm -# which is available throught the Bioconductor msa package. +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> ------------------------------------------------------------ +#TOC> 1 Preparations 51 +#TOC> 2 Aligning full length MBP1 proteins 99 +#TOC> 2.1 Preparing Sequences 110 +#TOC> 2.2 Compute the MSA 135 +#TOC> 3 Analyzing an MSA 156 +#TOC> 4 Comparing MSAs 227 +#TOC> 4.1 Importing an alignment to msa 236 +#TOC> 4.1.1 importing an .aln file 245 +#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 276 +#TOC> 4.2 More alignments 313 +#TOC> 4.3 Computing comparison metrics 325 +#TOC> 5 Profile-Profile alignments 462 +#TOC> 6 Sequence Logos 539 +#TOC> 6.1 Subsetting an alignment by motif 548 +#TOC> 6.2 Plot a Sequence Logo 591 +#TOC> +#TOC> ========================================================================== -if (!require(Biostrings, quietly=TRUE)) { - source("https://bioconductor.org/biocLite.R") + +# = 1 Preparations ======================================================== + +# You need to reload you protein database, including changes that might +# have been made to the reference files. If you have worked with the +# prerequiste units, you should have a script named "makeProteinDB.R" +# that will create the myDB object with aprotein and feature database. +# Ask for advice if not. +source("makeProteinDB.R") + + +# Multiple sequence alignment algorithms are provided in +# the Bioconductor msa package. + +if (! require(Biostrings, quietly=TRUE)) { + if (! exists("biocLite")) { + source("https://bioconductor.org/biocLite.R") + } biocLite("Biostrings") + library(Biostrings) } -data(BLOSUM62) +# Package information: +# library(help = Biostrings) # basic information +# browseVignettes("Biostrings") # available vignettes +# data(package = "Biostrings") # available datasets -if (!require(msa, quietly=TRUE)) { - source("https://bioconductor.org/biocLite.R") + +if (! require(msa, quietly=TRUE)) { + if (! exists("biocLite")) { + source("https://bioconductor.org/biocLite.R") + } biocLite("msa") library(msa) } +# Package information: +# library(help=msa) # basic information +# browseVignettes("msa") # available vignettes +# data(package = "msa") # available datasets -# If the installation asks you if you want to update older packages, always -# answer "a" for "all" unless you have an important reason not to. But if the -# installer asks you whether you want to compile from source, answer"n" for "no" -# unless you need new functionality in a particular bleeding-edge version of a -# package. + +# If an installation asks you if you want to update older packages, I recommend +# to always answer "a" for "all" unless you have an important reason not to. But +# if the installer asks you whether you want to compile from source, answer "n" +# for "no" unless you need new functionality in a particular bleeding-edge +# version of a package. help(package = "msa") -# We have used biostrings' AAString() function before; for multiple -# alignments we need an AAStringSet(). We can simply feed it -# a vector of sequences: -# Let's make a shorthand for selection of Mbp1 proteins from our database: a -# vector of indices for all of the rows in the protein table that contain -# proteins whose name begins with MBP1. -iMbp1Proteins <- grep("^MBP1_", myDB$protein$name) +# = 2 Aligning full length MBP1 proteins ================================== -# Next we create an AAStringSet for all of those proteins -seqSet <- AAStringSet(myDB$protein$sequence[iMbp1Proteins]) +# In the Wiki part of this unit you have +# - aligned full length MBP1 protein sequences at the EBI using T-Coffee +# - saved the resulting alignment in CLUSTAL format +# to the file "MBP1orthologuesTC.aln" -# ... and align (which is very simple) ... -msaMuscle( - seqSet, - order = "aligned") - -# ... but to help us make sense of the alignment we need to add the names for -# the sequences. names for a seqSet object are held in the ranges slot... - -seqSet@ranges@NAMES <- myDB$protein$name[iMbp1Proteins] - -seqSet - -# This little step of adding names is actually really -# very important. That's because the aligned sequences -# are meaningless strings of characters unless we can -# easily identify their biological relationships. -# Creating MSAs that are only identified by e.g. their -# RefSeq ids is a type of cargo-cult bioinformatics -# that we encounter a lot. The point of the alignment -# is not to create it, but to interpret it! +# In this section we will calculate an MSA of the same sequences using +# algorithms in the msa packages, and we will compare the two alignments. -# Let's align again, and assign the result ... -msa1 <- msaMuscle( - seqSet, - order = "aligned") +# == 2.1 Preparing Sequences =============================================== -msa1 +# We have used the biostrings' AAString() function before; for multiple +# alignments we need an AAStringSet(). AAStringSets are produced from vectors +# of sequence. + +sel <- grep("MBP1", myDB$protein$name) +MBP1set <- AAStringSet(myDB$protein$sequence[sel]) + +# To help us make sense of the alignment we need to add the names for +# the sequences. Names for a seqSet object are held in the ranges slot... + +MBP1set@ranges@NAMES <- myDB$protein$name[sel] + +MBP1set + +# You should have eleven sequences in this set, ask for advice if you don't. + +# The little step of adding names is actually really very important. That's +# because the aligned sequences are meaningless strings of characters unless we +# can easily identify their biological relationships. Creating MSAs that are +# only identified by e.g. their RefSeq ids is a type of cargo-cult +# bioinformatics that we encounter a lot. The point of the alignment is not to +# create it, but to interpret it! + +# == 2.2 Compute the MSA =================================================== + +# The alignment itself is very simple. msa has msaMuscle() and msaClustalOmega() +# to produce alignments. (It also has msaClustalW() which is kind of +# embarrassing since that hasn't been the algorithm of first choice for +# decades. Don't use that one for real work.) + + +# Let's run an alignment with "Muscle" +(msaM <- msaMuscle( MBP1set, order = "aligned")) # ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method): -print(msa1, show=c("alignment", "complete"), showConsensus=FALSE) +print(msaM, show=c("alignment", "complete"), showConsensus=FALSE) # You see that the alignment object has sequence strings with hyphens as @@ -98,93 +152,460 @@ print(msa1, show=c("alignment", "complete"), showConsensus=FALSE) # the order has not been preserved, but the most similar sequences are now # adjacent to each other. -# Lets write the alignment to one of the common file formats: a multi-fasta -# file. -# Why oh why does the msa package not have a function to do this !!! -# Like, seriously ... +# = 3 Analyzing an MSA ==================================================== -# ==== writeMFA +# You probabaly realize that computing an MSA is not that hard. It's not +# entirely trivial to collect meaningful sequences via e.g. PSI-BLAST ... but +# then computing the alignment gives you a result quickly. But what does it +# mean? What information does the MSA contain? -# Here is our own function to write a AAStringSet object to a multi-FASTA file. -writeMFA <- function(ali, file, blockSize = 50) { - if (missing(ali)) { - stop("Input object missing from arguments with no default.") - } - if (missing(file)) { - writeToFile <- FALSE - } - else { - writeToFile <- TRUE - sink(file) # divert output to file - } - # Extract the raw data from the objects depending on - # their respective class and put this - # into a named vector of strings. - if (class(ali)[1] == "MsaAAMultipleAlignment") { - strings <- character(nrow(ali)) - for (i in 1:nrow(ali)) { - strings[i] <- as.character(ali@unmasked[i]) - names(strings)[i] <- ali@unmasked@ranges@NAMES[i] - } - } - else if (class(ali)[1] == "AAStringSet") { - strings <- character(length(ali)) - for (i in 1:length(ali)) { - strings[i] <- as.character(ali[i]) - names(strings)[i] <- ali@ranges@NAMES[i] - } - } - else { - stop(paste("Input object of class", - class(ali)[1], - "can't be handled by this function.")) - } +# Let's have a first look at conserved vs. diverged regions of the MSA. msa +# provides the function msaConservationScore() which outputs a vector of scores. +# The scores are the sum of pairscores for the column: for example a perfectly +# conserved column of histidines would have the following score in our MSA +# of eleven sequences: +# - one (H, H) pair score is 8 in BLOSUM62; +# - there are (n^2 - n) / 2 pairs that can be formed between amino acids +# in a column from n sequences; +# - therefore the column score is 8 * (11^2 - 11) / 2 == 440 + +data("BLOSUM62") # fetch the BLOSUM62 package from the Biostrings package + +msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62) +plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position") + +# That plot shows the well-aligned regions (domains ?) of the sequence, but it +# could use some smoothing. Options for smoothing such plots include calculating +# averages in sliding windows ("moving average"), and lowess() smoothing. Here +# is a quick demo of a moving average smoothing, to illustrate the principle. + +wRadius <- 15 # we take the mean of all values around a point +- wRadius +len <- length(msaMScores) +v <- msaMScores +for (i in (1 + wRadius):(len - wRadius)) { + v[i] <- mean(msaMScores[(i - wRadius):(i + wRadius)]) # mean of values in + # window around i +} +points(v, col = "#FFFFFF", type = "l", lwd = 4.5) +points(v, col = "#3DAEB2", type = "l", lwd = 3) - for (i in 1:length(strings)) { - # output FASTA header - cat(paste(">", - names(strings)[i], - "\n", - sep="")) - # output the sequence block by block ... - nLine <- 1 - from <- 1 - while (from < nchar(strings[i])) { - to <- from + blockSize - 1 - cat(paste(substr(strings[i], from, to), "\n", sep="")) - from <- to + 1 - } - cat("\n") # output an empty line - } - if (writeToFile) { - sink() # Done. Close the diversion. +# You can set a threshold and use rle() to define ranges of values that fall +# above and below the threshold, and thus approximate domain boundaries: +thrsh <- 30 +(highScoringRanges <- rle(v > thrsh)) +(idx <- c(1, cumsum(highScoringRanges$lengths))) +for (i in seq_along(highScoringRanges$lengths)) { + if (highScoringRanges$values[i] == TRUE) { # If this range is above threshold, + rect(idx[i], thrsh, idx[i+1], max(v), # ... draw a rectangle + col = "#205C5E33") # ... with a transparent color. + cat(sprintf("Possible domain from %d to %d\n", idx[i], idx[i+1])) } } -# confirm that the function works -writeMFA(seqSet) -writeMFA(msa1) +# Getting this right requires a bit of fiddling with the window radius and +# threshold (experiment with that a bit), but once we are satisfied, we can use +# the boundaries to print the MSA alignments separately for domains. +# Unfortunately the msa package provides no native way to extract blocks of the +# alignment for further processing, but your .utilities file loads a function to +# write alignment objects and sequence sets to .aln formatted output. Have a +# look: -# We could use this function to write the raw and aligned sequences to file like -# so: +writeALN -# writeMFA(seqSet, file = "APSES_proteins.mfa") -# writeMFA(msa1, file = "APSES_proteins_muscle.mfa") - -# ... but since we don't actually need the data on file now, just copy the code -# that would create the msa to your myCode file so you can quickly reproduce it. - -# == Task: -# Print the output of print(msa1) on a sheet of paper and bring it to -# class for the next quiz. - -# That'll do. +# Print out the aligned blocks +for (i in seq_along(highScoringRanges$lengths)) { + if (highScoringRanges$values[i] == TRUE) { # If this range is above threshold, + cat(sprintf("\n\nPossible domain from %d to %d\n", idx[i], idx[i+1])) + writeALN(msaM, range = c(idx[i], idx[i+1])) + } +} -# = 1 Tasks +# = 4 Comparing MSAs ====================================================== + +# Let's compare the results of different alignment algorithms. We computed a +# vector of scores above, and we can compare that for different alignment +# algorithms. This is not trivial, so we'll need to look at that data in +# different ways and explore it. But first, let's get more alignments to compare +# with. + + +# == 4.1 Importing an alignment to msa ===================================== + +# We computed a T-Coffee alignment at the EBI. msa has no native import function +# so we need to improvise, and it's a bit of a pain to do - but a good +# illustration of startegies to convert data into any kind of object: +# - read an .aln file +# - adjust the sequence names +# - convert to msaAAMultipleAlignment object + +# === 4.1.1 importing an .aln file + +# The seqinr package has a function to read CLUSTAL W formatted .aln files ... +if (! require(seqinr, quietly=TRUE)) { + install.packages(seqinr) + library(seqinr) +} +# Package information: +# library(help=seqinr) # basic information +# browseVignettes("seqinr") # available vignettes +# data(package = "seqinr") # available datasets + +# read the donwloaded file +tmp <- read.alignment("msaT.aln", format = "clustal") + +# read.alignment() returns a list. $seq is a list of strings, one for each +# complete alignment. However, they are converted to lower case. +x <- toupper(unlist(tmp$seq)) # get the sequences, uppercase + +# $nam contains the names. +(names(x) <- tmp$nam) + +# Note that the names in the file are refseq IDs, we need to replace the +# RefSeqIDs with the database names: +for (i in seq_along(x)) { + # find the index of the RefSeqID + id <- gsub("\\..*$", "", names(x)[i]) # fetch the name, drop the version + sel <- which(myDB$protein$RefSeqID == id) # get the index + names(x)[i] <- myDB$protein$name[sel] # get the name +} + +# === 4.1.2 Creating an MsaAAMultipleAlignment object + +# MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects +# in their @unmasked slot, and a few additional items. Rather then build the +# object from scratch, we copy an axisting object, and overwrite the dta in its +# slots with what we need. Our goal is pragmatic, we want an object that msa's +# functions will accept as input. + +# First: convert our named char vector into an AAstringSet +x <- AAStringSet(x) + +# Then: create a new MsaAAMultipleAlignment S4 object. The msa package has +# defined what such an object should look like, with the SetClass() function. To +# create a new object, simply use the new() function, define the class that the +# object would have, and fill the slots with something that has the right type. +# How do we know the right type and the slot names? Easy! We just use +# str() to get the information. + +str(msaM) + +msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object + unmasked = x, # "unmasked" slot takes an AASringSet + version = "T-Coffee", # "version" slot takes a string + params = list(), # "params" takes a list(), we leave the + # list empty, but we could add the + # alignment parameters that we used at + # the EBI here. + call = "imported from T-coffee alignment") # also a string + +str(msaT) + + +msaT # Now we have fabricated an msaAAMultipleAlignment object, and we can + # use the msa package functions on it + +msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62) + +# == 4.2 More alignments =================================================== + +# Next, we calculate alignments with msa's two other alignment options: +# CLUSTAL Omega +(msaO <- msaClustalOmega( MBP1set, order = "aligned")) +msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62) + +# CLUSTAL W +(msaW <- msaClustalW( MBP1set, order = "aligned")) +msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62) + + +# == 4.3 Computing comparison metrics ====================================== + +# Ready to compare. + +# ... sum of alignment scores of alignment divided by sum of alignment scores +# of reference alignment (arbitrarily using CLUSTAL W as reference) + +sRef <- sum(msaWScores) +sum(msaWScores) / sRef # CLUSTAl W +sum(msaOScores) / sRef # CLUSTAL O +sum(msaTScores) / sRef # T-COFFEE +sum(msaMScores) / sRef # MUSCLE + +# ... mean alignment scores (higher is better) + +mean(msaWScores) # CLUSTAl W +mean(msaOScores) # CLUSTAL O +mean(msaTScores) # T-COFFEE +mean(msaMScores) # MUSCLE + +# total number of gaps (lower is better) +countGaps <- function(ali) { + x <- paste0(as.character(ali), collapse = "") + aa <- nchar(gsub("-", "", x)) + return(nchar(x) - aa) +} + +countGaps(msaW) # CLUSTAl W +countGaps(msaO) # CLUSTAL O +countGaps(msaT) # T-COFFEE +countGaps(msaM) # MUSCLE + +# number of indels in alignment (lower is less fragmented) +countIndels <- function(ali) { + x <- paste0(as.character(ali), collapse = "") # collapse into single string + x <- unlist(strsplit(x, "")) # split into characters + x <- x == "-" # convert into boolean + x <- rle(x) # calculate rle + # every run of TRUE is one indel event + return(sum(x$values)) +} + +countIndels(msaW) # CLUSTAl W +countIndels(msaO) # CLUSTAL O +countIndels(msaT) # T-COFFEE +countIndels(msaM) # MUSCLE + +# Let's look at the distribution of alignment scores: +boxplot(list(CLUSTAL.W = msaWScores, + CLUSTAL.O = msaOScores, + T.COFFEE = msaTScores, + MUSCLE = msaMScores), + col = c("#7D556488", "#74628F88", "#5E78A188", "#3DAEB288")) + +# CLUSTAL W and CLUSTAL O don't look all that different. T-Coffee tends to have +# a tighter distribution with less negative scores. Muscle has a slightly higher +# mean and generally higher scores. + +# Boxplots are convenient, but don't give us much detail about the shape of the +# distribution. For that, we need histograms, or density plots. + +plot(density(msaWScores), + type = "l", + col = "#7D5564", + lwd = 1.5, + ylim = c(0, (max(density(msaWScores)$y) * 1.3)), + main = "Comparing MSA algorithms", + xlab = "Alignment Score", + ylab = "Density") +points(density(msaOScores), + type = "l", + lwd = 1.5, + col = "#74628F") +points(density(msaTScores), + type = "l", + lwd = 1.5, + col = "#5E78A1") +points(density(msaMScores), + type = "l", + lwd = 1.5, + col = "#3DAEB2") +legend("topright", + legend = c("MUSCLE", "T-COFFEE", "CLUSTAL O", "CLUSTAL W"), + col = c("#3DAEB2", "#5E78A1", "#74628F", "#7D5564"), + lwd = 2, + cex = 0.7, + bty = "n") + +# The desnity plots confirm in more detail that CLUSTAL W misses some of the +# higher-scoring possibilities, that wherever CLUSTAL O is bad, it is quite bad, +# that T-COFFEE has fewer poorly scoring columns but misses some of the better +# scoring possibilities, and that MUSCLE appears to do best overall. + +# Can we attribute these differences to sections of the alignment in which the +# algorithms did better or worse? Let's plot the scores cumulatively. The +# alignments have different lengths, so we plot the scores on the respective +# fraction of the alignement length. + +plot(seq(0, 1, length.out = length(msaWScores)), # x- axis: fraction of length + cumsum(msaWScores), + type = "l", + col = "#7D5564", + lwd = 1.5, + ylim = c(0, max(cumsum(msaMScores))), + main = "Comparing MSA algorithms", + xlab = "Alignment Position", + ylab = "Cumulative Alignment Score") +points(seq(0, 1, length.out = length(msaOScores)), # x- axis: fraction of length + cumsum(msaOScores), + type = "l", + lwd = 1.5, + col = "#74628F") +points(seq(0, 1, length.out = length(msaTScores)), # x- axis: fraction of length + cumsum(msaTScores), + type = "l", + lwd = 1.5, + col = "#5E78A1") +points(seq(0, 1, length.out = length(msaMScores)), # x- axis: fraction of length + cumsum(msaMScores), + type = "l", + lwd = 1.5, + col = "#3DAEB2") +legend("bottomright", + legend = c("MUSCLE", "T-COFFEE", "CLUSTAL O", "CLUSTAL W"), + col = c("#3DAEB2", "#5E78A1", "#74628F", "#7D5564"), + lwd = 2, + cex = 0.7, + bty = "n") + +# Your alignment is going to be differnte from mine, due to the inclusion of +# MYSPE - but what I see is that MUSCLE gives the highest score overall, and +# achieves this with fewer indels then most, and the lowest number of gaps of +# all algorithms. + +# To actually compare regions of alignments, we need to align alignments. + + +# = 5 Profile-Profile alignments ========================================== + + +# Profile-profile alignments are the most powerful way to pick up distant +# relationships between sequence families. The can be used, for example to build +# a profile from structural superpositions of crystal structures, and then map a +# MSA alignment onto those features. Here we will use profile-profile comparison +# to compare two MSAs with each other, by aligning them. The algorithm is +# provided by the DECIPHER package. + +if (! require(DECIPHER, quietly=TRUE)) { + if (! exists("biocLite")) { + source("https://bioconductor.org/biocLite.R") + } + biocLite("DECIPHER") + library(DECIPHER) +} +# Package information: +# library(help = DECIPHER) # basic information +# browseVignettes("DECIPHER") # available vignettes +# data(package = "DECIPHER") # available datasets + +# AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and +# CLUSTAL W alignments: we could do this directly ... +AlignProfiles(msaW@unmasked, msaM@unmasked) + +# But for ease of comparison, we'll reorder the sequences of the CLUSTAL W +# alignment into the same order as the MUSCLE alignment: +m <- as.character(msaM) +w <- as.character(msaW)[names(m)] + +(ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m))) + +# Conveniently, AlignProfiles() returns an AAStringSet, so we can use our +# writeALN function to show it. Here is an arbitrary block, from somewhere in +# the middle of the alignment: + +writeALN(ppa, range = c(751, 810)) + +# If you look at this for a while, you can begin to figue out where the +# algorithms made different decisions about where to insert gaps, and how to +# move segments of sequence around. But matters become more clear if we +# post-process this profile-profile alignment. Let's replace all hyphens that +# the pp-alignment has inserted with blanks, and let's add a separator line down +# the middle between the two alignments. + +x <- unlist(strsplit(as.character(ppa), "")) # unlist all +dim(x) <- c(width(ppa)[1], length(ppa)) # form into matrix by columns +x <- t(x) # transpose the matrix +(a1 <- 1:(nrow(x)/2)) # rows of alignment 1 +(a2 <- ((nrow(x)/2) + 1):nrow(x)) # rows of alignment 2 +for (i in 1:ncol(x)) { + if (all(x[a1, i] == "-")) { x[a1, i] <- " " } # blank hyphens that shift + if (all(x[a2, i] == "-")) { x[a2, i] <- " " } # original alignment blocks +} + +# collapse the matrix into strings +ppa2 <- character() +for (i in 1:nrow(x)) { + ppa2[i] <- paste0(x[i, ], collapse = "") + names(ppa2)[i] <- names(ppa)[i] +} + +# add a separator line +x <- paste0(rep("-", width(ppa)[1]), collapse = "") +ppa2 <- c(ppa2[a1], x, ppa2[a2]) + +# inspect +writeALN(ppa2, range = c(800, 960)) + +# Again, go explore, and get a sense of what's going on. You may find that +# CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas +# MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would +# expect from an algorithm that builds alignments from pairwise local +# alignments, without global refinement. + + +# = 6 Sequence Logos ====================================================== + +# To visualize the information that we can get about structure and function with +# an MSA, we'll calculate a sequence logo of the Mbp1 recognition helix - the +# part of the structure that inserts into the major groove of the DNA and +# provides sequence specificity to the DNA binding of this transcription factor. +# Helix-B in Mbp1 with four residues upstream and downstream spans the sequence +# 43-ANFAKAKRTRILEKEVLKE-61 + +# == 6.1 Subsetting an alignment by motif ================================== + +# Finding the location of such an substring in an alignment is not entirely +# trivial, because the alignment might have produced indels in that sequence. +# Our strategy can be: +# - fetch the sequence from the alignment +# - remove all hyphens +# - find the range where the target sequence matches +# - count how many characters in all there are in the aligned sequence, up +# to the start and end of the match +# - these numbers define the range of the match in the alignment. + +x <- as.character(msaM)["MBP1_SACCE"] +xAA <- gsub("-", "", x) + +motif <- "ANFAKAKRTRILEKEVLKE" +(m <- regexpr(motif, xAA)) # matched in position 43, with a length of 19 +(motifStart <- as.numeric(m)) +(motifEnd <- attr(m, "match.length") + motifStart - 1) + +# To count characters, we split the string into single characters ... +x <- unlist(strsplit(x, "")) + +# ... convert this into a boolean, which is true if the character is not +# a hyphen ... +x <- x != "-" + +# ... cast this into a numeric, which turns TRUE into 1 and FALSE into 0 ... +x <- as.numeric(x) + +# ... and sum up the cumulative sum. +x <- cumsum(x) + +# Now we can find where the 43'd and 61'st characters are located in the +# alignment string ... +(aliStart <- which(x == motifStart)[1]) # get the first hit if there are more +(aliEnd <- which(x == motifEnd)[1]) + +# ... and subset the alignment + +(motifAli <- subseq(msaM@unmasked, start = aliStart, end = aliEnd)) + + +# == 6.2 Plot a Sequence Logo ============================================== + +# There are now several good options to plot sequence logos in R, these include +# dagLogo, DiffLogo, Logolas, and motifStack. For our example we will use +# ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at +# the EBI. + +if (! require(ggseqlogo, quietly=TRUE)) { + install.packages(("ggseqlogo")) + library(ggseqlogo) +} +# Package information: +# library(help=ggseqlogo) # basic information +# browseVignettes("ggseqlogo") # available vignettes +# data(package = "ggseqlogo") # available datasets + +ggseqlogo(as.character(motifAli)) + diff --git a/scripts/ABC-writeALN.R b/scripts/ABC-writeALN.R new file mode 100644 index 0000000..2a6c77b --- /dev/null +++ b/scripts/ABC-writeALN.R @@ -0,0 +1,135 @@ +# ABC-writeALN.R +# +# ToDo: calculate consensus line +# append sequence numbers +# Notes: +# +# ============================================================================== + + +writeALN <- function(ali, + range, + note = "", + myCon = stdout(), + blockWidth = 60) { + # Purpose: + # Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or + # a file in multi-FASTA format. + # Version: 2.0 + # Date: 2017 10 + # Author: Boris Steipe + # + # Parameters: + # ali MsaAAMultipleAlignment or AAStringSet or character + # vector. + # range num a two-integer vector of start and end positions if + # only a range of the MSA should be written, e.g. + # a domain. Defaults to the full alignment length. + # note chr a vector of character that is appended to the name + # of a sequence in the FASTA header. Recycling of + # shorter vectors applies, thus a vector of length one + # is added to all headers. + # myCon a connection (cf. the con argument for writeLines). + # Defaults to stdout() + # blockWidth int width of sequence block. Default 80 characters. + # Value: + # NA the function is invoked for its side effect of printing an + # alignment to stdout() or file. + + blockWidth <- as.integer(blockWidth) + if (is.na(blockWidth)) { + stop("PANIC: parameter \"blockWidth\" must be numeric.") + } + if (blockWidth < 1) { + stop("PANIC: parameter \"blockWidth\" must be greater than zero.") + } + if (blockWidth > 60) { + stop("PANIC: \"blockWidth\" for CLUSTAL format can't be greater than 60.") + } + + # Extract the raw data from the objects depending on their respective class + # and put it into a named vector of strings. + + # Extract XStringSet from MsaXMultipleAlignment ... + if (class(ali) == "MsaAAMultipleAlignment" | + class(ali) == "MsaDNAMultipleAlignment" | + class(ali) == "MsaRNAMultipleAlignment") { + ali <- ali@unmasked + } + + # Process XStringSet + if (class(ali) == "AAStringSet" | + class(ali) == "DNAStringSet" | + class(ali) == "RNAStringSet") { + sSet <- as.character(ali) # we use as.character(), not toString() thus + # we don't _have_ to load Biostrings + } else if (class(ali) == "character") { + sSet <- ali + } else { + stop(paste("Input object of class", + class(ali), + "can't be handled by this function.")) + } + + if (missing(range)) { + range <- 1 + range[2] <- max(nchar(sSet)) + } else { + range <- as.integer(range) + if(length(range) != 2 || + any(is.na(range)) || + range[1] > range[2] || + range[1] < 1) { + stop("PANIC: \"range\" parameter must contain valid start and end index.") + } + } + + # Right-pad any sequence with "-" that is shorter than ranges[2] + for (i in seq_along(sSet)) { + if (nchar(sSet[i]) < range[2]) { + sSet[i] <- paste0(sSet[i], + paste0(rep("-", range[2] - nchar(sSet[i])), + collapse = "")) + } + } + + # Right-pad sequence names + sNames <- names(sSet) + len <- max(nchar(sNames)) + 2 # longest name plus two spaces + for (i in seq_along(sNames)) { + sNames[i] <- paste0(sNames[i], + paste0(rep(" ", len - nchar(sNames[i])), + collapse = "")) + } + + + # Process each sequence + txt <- paste0("CLUSTAL W format. ", note) + txt[2] <- "" + + iStarts <- seq(range[1], range[2], by = blockWidth) + iEnds <- c((iStarts[-1] - 1), range[2]) + + for (i in seq_along(iStarts)) { + for (j in seq_along(sSet)) { + txt <- c(txt, + paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i]))) + } + txt <- c(txt, "") # append a blank consenus line + txt <- c(txt, "") # append a separator line + } + + writeLines(txt, con= myCon) + +} + +# ==== TESTS ================================================================= +# Enter your function tests here... + +if (FALSE) { + # test ... +} + + + +# [END] diff --git a/scripts/ABC-writeMFA.R b/scripts/ABC-writeMFA.R new file mode 100644 index 0000000..1599912 --- /dev/null +++ b/scripts/ABC-writeMFA.R @@ -0,0 +1,115 @@ +# ABC-writeMFA.R +# +# ToDo: +# Notes: +# +# ============================================================================== + + +writeMFA <- function(ali, + range, + note = "", + myCon = stdout(), + blockWidth = 80) { + # Purpose: + # Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or + # a file in multi-FASTA format. + # Version: 2.0 + # Date: 2017 10 + # Author: Boris Steipe + # + # Parameters: + # ali MsaAAMultipleAlignment or AAStringSet or character + # vector + # range num a two-integer vector of start and end positions if + # only a range of the MSA should be written, e.g. + # a domain. Defaults to the full sequence length. + # note chr a vector of character that is appended to the name + # of a sequence in the FASTA header. Recycling of + # shorter vectors applies, thus a vector of length one + # is added to all headers. + # myCon a connection (cf. the con argument for writeLines). + # Defaults to stdout() + # blockWidth int width of sequence block. Default 80 characters. + # Value: + # NA the function is invoked for its side effect of printing an + # alignment to stdout() or file. + + blockWidth <- as.integer(blockWidth) + if (is.na(blockWidth)) { + stop("PANIC: parameter \"blockWidth\" must be numeric.") + } + if (blockWidth < 1){ + stop("PANIC: parameter \"blockWidth\" must be greater than zero.") + } + + # Extract the raw data from the objects depending on their respective class + # and put it into a named vector of strings. + + # Extract XStringSet from MsaXMultipleAlignment ... + if (class(ali) == "MsaAAMultipleAlignment" | + class(ali) == "MsaDNAMultipleAlignment" | + class(ali) == "MsaRNAMultipleAlignment") { + ali <- ali@unmasked + } + + # Process XStringSet + if (class(ali) == "AAStringSet" | + class(ali) == "DNAStringSet" | + class(ali) == "RNAStringSet") { + sSet <- as.character(ali) # we use as.character(), not toString() thus + # we don't _have_ to load Biostrings + } else if (class(ali) == "character") { + sSet <- ali + } else { + stop(paste("Input object of class", + class(ali), + "can't be handled by this function.")) + } + + if (missing(range)) { + range <- 1 + range[2] <- max(nchar(sSet)) + } else { + range <- as.integer(range) + if(length(range) != 2 || + any(is.na(range)) || + range[1] > range[2] || + range[1] < 1) { + stop("PANIC: \"range\" parameter must contain valid start and end index.") + } + } + + # Process each sequence + txt <- character() + headers <- paste(names(sSet), note) + for (i in seq_along(sSet)) { + + # output FASTA header + txt <- c(txt, sprintf(">%s", headers[i])) + + # output the sequence in blocks of blockWidth per line ... + iStarts <- seq(range[1], range[2], by = blockWidth) + iEnds <- c((iStarts[-1] - 1), range[2]) + + thisSeq <- substring(sSet[i], iStarts, iEnds) # collect all blocks + thisSeq <- thisSeq[! nchar(thisSeq) == 0] # drop empty blocks + txt <- c(txt, thisSeq) + + txt <- c(txt, "") # append an empty line for readability + } + + writeLines(txt, con= myCon) + +} + +# ==== TESTS ================================================================= +# Enter your function tests here... + +if (FALSE) { + # test ... +} + + + +# [END]