Line termination change and old code.
This commit is contained in:
		
							
								
								
									
										258
									
								
								.Rprofile
									
									
									
									
									
								
							
							
						
						
									
										258
									
								
								.Rprofile
									
									
									
									
									
								
							| @@ -1,129 +1,129 @@ | ||||
| # .Rprofile | ||||
| # | ||||
| # This script is automatically executed on startup | ||||
| # ============================================================================== | ||||
|  | ||||
| init <- function() { | ||||
|  | ||||
|   # Create a local copy of myScript.R if not done yet. | ||||
|   if (! file.exists("myScript.R") && file.exists(".tmp.R")) { | ||||
|     file.copy(".tmp.R", "myScript.R") | ||||
|     cat("A new file \"myScript.R\" was created. You can use it for\n") | ||||
|     cat("notes and code experiments.\n\n") | ||||
|   } | ||||
|  | ||||
|   cat("\n\n") | ||||
|   cat("Please open the file \".myProfile.R\" (click on the file-name in the\n") | ||||
|   cat("\"files\" pane), edit it and save it.\n") | ||||
|   cat("Then click the checkbox, and use the More -> Move... dialogue\n") | ||||
|   cat("to move it into the \"myScripts\" folder.\n\n") | ||||
|  | ||||
|   file.edit("ABC-units.R") | ||||
|   return(invisible(NULL)) | ||||
| } | ||||
|  | ||||
| if (! file.exists("./myScripts/.myProfile.R")) { | ||||
|   cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n") | ||||
|   cat("    =================") | ||||
|   cat("\n\n") | ||||
|   cat("        WELCOME !\n") | ||||
|   cat("\n") | ||||
|   cat("  Type  'init()'  to begin\n\n") | ||||
|   cat("\n") | ||||
|   cat("    =================") | ||||
|   cat("\n\n") | ||||
|  | ||||
| } else {  # local profile exists ... validate state: | ||||
|   cat("\n\nLoading local functions ...") | ||||
|  | ||||
|   source(".utilities.R")  # local profile appears sane, source utilities | ||||
|   source("./myScripts/.myProfile.R") | ||||
|  | ||||
|   if (! exists("myEMail")) {  # ... has eMail been defined? | ||||
|     cat("ERROR !\n") | ||||
|     cat("=======\n") | ||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||
|     cat("the variable \"myEMail\" was not loaded.\n") | ||||
|     cat("Please contact your instructor to continue.\n\n") | ||||
|   } | ||||
|   if (! exists("myStudentNumber")) {  # ... has the Student Number been defined? | ||||
|     cat("ERROR !\n") | ||||
|     cat("=======\n") | ||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||
|     cat("the variable \"myStudentNumber\" was not loaded.\n") | ||||
|     cat("Please contact your instructor to continue.\n\n") | ||||
|   } | ||||
|   if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) { | ||||
|     cat("ERROR !\n")                 # is the Student Number valid? | ||||
|     cat("=======\n") | ||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||
|     cat("your Student Number could not be validated.\n") | ||||
|     cat("Please examine the file \"./myScripts/.myProfile.R\"\n") | ||||
|     cat(" and fix the problem or contact your instructor to continue.\n\n") | ||||
|   } | ||||
|  | ||||
|  | ||||
|   if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now | ||||
|                             # ... and write it into the profile. | ||||
|        prf <- readLines("./myScripts/.myProfile.R") | ||||
|        iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf) | ||||
|        out <- prf[1:iEmail] | ||||
|        out <- c(out, sprintf("MYSPE <- \"%s\" ", | ||||
|                              getMYSPE(myStudentNumber))) | ||||
|        out <- c(out, prf[(iEmail+1):length(prf)]) | ||||
|        writeLines(out, "./myScripts/.myProfile.R") | ||||
|  | ||||
|        cat("\n") | ||||
|        cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n", | ||||
|                    getMYSPE(myStudentNumber))) | ||||
|        MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use | ||||
|        rm(prf, iEmail, out)                # cleanup | ||||
|   } | ||||
|   cat("... done.\n\n") | ||||
| } | ||||
|  | ||||
| if (default.stringsAsFactors()) { | ||||
|   cat("WARNING.\n") | ||||
|   cat("========\n") | ||||
|   cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n") | ||||
|   cat("This will break some of the code.\n") | ||||
|   cat("Please contact your instructor to troubleshoot and fix this issue.\n") | ||||
|   cat("\n") | ||||
| } | ||||
|  | ||||
| errText <- list() | ||||
| errText[["noProfileFile"]] <- ' | ||||
| Your PROFILE FILE does not exist. This problem must be fixed to continue. | ||||
|  | ||||
|   The code expects the file "./myScripts/.myProfile.R" to exist and to | ||||
|   contain your correct eMail address and student number. Detailed | ||||
|   instructions were given when you first ran the init() command. | ||||
|  | ||||
|   Try running init() again and follow the instructions. Reload your RStudio | ||||
|   session and start over with this file. | ||||
|  | ||||
|   If this does not fix the problem, ask for help. | ||||
| ' | ||||
|  | ||||
| errText[["noStudentNumber"]] <- ' | ||||
| Your STUDENT NUMBER has not been defined. This problem must be fixed to continue. | ||||
|  | ||||
|   The code expects the file "./myScripts/.myProfile.R" to exist and to | ||||
|   contain your correct eMail address and student number. This file gets | ||||
|   sourced when you start a new R-session, but since you see this error | ||||
|   message there was a problem. | ||||
|  | ||||
|   Perhaps you need to restart your R-session. Try closing the RStudio | ||||
|   project and reopening it from the File > Recent Projects menu. | ||||
|  | ||||
|   Perhaps there was a syntax error in your file. Then not all the | ||||
|   instructions in the file are executed. Check the file: is your | ||||
|   email perhaps not defined? Or did you type it without quotation | ||||
|   marks? | ||||
|  | ||||
|   Try fixing problems, and then restart R as described above. | ||||
|  | ||||
|   If none of this fixes the problem, ask for help. | ||||
| ' | ||||
|  | ||||
| # [END] | ||||
| # .Rprofile | ||||
| # | ||||
| # This script is automatically executed on startup | ||||
| # ============================================================================== | ||||
|  | ||||
| init <- function() { | ||||
|  | ||||
|   # Create a local copy of myScript.R if not done yet. | ||||
|   if (! file.exists("myScript.R") && file.exists(".tmp.R")) { | ||||
|     file.copy(".tmp.R", "myScript.R") | ||||
|     cat("A new file \"myScript.R\" was created. You can use it for\n") | ||||
|     cat("notes and code experiments.\n\n") | ||||
|   } | ||||
|  | ||||
|   cat("\n\n") | ||||
|   cat("Please open the file \".myProfile.R\" (click on the file-name in the\n") | ||||
|   cat("\"files\" pane), edit it and save it.\n") | ||||
|   cat("Then click the checkbox, and use the More -> Move... dialogue\n") | ||||
|   cat("to move it into the \"myScripts\" folder.\n\n") | ||||
|  | ||||
|   file.edit("ABC-units.R") | ||||
|   return(invisible(NULL)) | ||||
| } | ||||
|  | ||||
| if (! file.exists("./myScripts/.myProfile.R")) { | ||||
|   cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n") | ||||
|   cat("    =================") | ||||
|   cat("\n\n") | ||||
|   cat("        WELCOME !\n") | ||||
|   cat("\n") | ||||
|   cat("  Type  'init()'  to begin\n\n") | ||||
|   cat("\n") | ||||
|   cat("    =================") | ||||
|   cat("\n\n") | ||||
|  | ||||
| } else {  # local profile exists ... validate state: | ||||
|   cat("\n\nLoading local functions ...") | ||||
|  | ||||
|   source(".utilities.R")  # local profile appears sane, source utilities | ||||
|   source("./myScripts/.myProfile.R") | ||||
|  | ||||
|   if (! exists("myEMail")) {  # ... has eMail been defined? | ||||
|     cat("ERROR !\n") | ||||
|     cat("=======\n") | ||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||
|     cat("the variable \"myEMail\" was not loaded.\n") | ||||
|     cat("Please contact your instructor to continue.\n\n") | ||||
|   } | ||||
|   if (! exists("myStudentNumber")) {  # ... has the Student Number been defined? | ||||
|     cat("ERROR !\n") | ||||
|     cat("=======\n") | ||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||
|     cat("the variable \"myStudentNumber\" was not loaded.\n") | ||||
|     cat("Please contact your instructor to continue.\n\n") | ||||
|   } | ||||
|   if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) { | ||||
|     cat("ERROR !\n")                 # is the Student Number valid? | ||||
|     cat("=======\n") | ||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||
|     cat("your Student Number could not be validated.\n") | ||||
|     cat("Please examine the file \"./myScripts/.myProfile.R\"\n") | ||||
|     cat(" and fix the problem or contact your instructor to continue.\n\n") | ||||
|   } | ||||
|  | ||||
|  | ||||
|   if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now | ||||
|                             # ... and write it into the profile. | ||||
|        prf <- readLines("./myScripts/.myProfile.R") | ||||
|        iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf) | ||||
|        out <- prf[1:iEmail] | ||||
|        out <- c(out, sprintf("MYSPE <- \"%s\" ", | ||||
|                              getMYSPE(myStudentNumber))) | ||||
|        out <- c(out, prf[(iEmail+1):length(prf)]) | ||||
|        writeLines(out, "./myScripts/.myProfile.R") | ||||
|  | ||||
|        cat("\n") | ||||
|        cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n", | ||||
|                    getMYSPE(myStudentNumber))) | ||||
|        MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use | ||||
|        rm(prf, iEmail, out)                # cleanup | ||||
|   } | ||||
|   cat("... done.\n\n") | ||||
| } | ||||
|  | ||||
| if (default.stringsAsFactors()) { | ||||
|   cat("WARNING.\n") | ||||
|   cat("========\n") | ||||
|   cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n") | ||||
|   cat("This will break some of the code.\n") | ||||
|   cat("Please contact your instructor to troubleshoot and fix this issue.\n") | ||||
|   cat("\n") | ||||
| } | ||||
|  | ||||
| errText <- list() | ||||
| errText[["noProfileFile"]] <- ' | ||||
| Your PROFILE FILE does not exist. This problem must be fixed to continue. | ||||
|  | ||||
|   The code expects the file "./myScripts/.myProfile.R" to exist and to | ||||
|   contain your correct eMail address and student number. Detailed | ||||
|   instructions were given when you first ran the init() command. | ||||
|  | ||||
|   Try running init() again and follow the instructions. Reload your RStudio | ||||
|   session and start over with this file. | ||||
|  | ||||
|   If this does not fix the problem, ask for help. | ||||
| ' | ||||
|  | ||||
| errText[["noStudentNumber"]] <- ' | ||||
| Your STUDENT NUMBER has not been defined. This problem must be fixed to continue. | ||||
|  | ||||
|   The code expects the file "./myScripts/.myProfile.R" to exist and to | ||||
|   contain your correct eMail address and student number. This file gets | ||||
|   sourced when you start a new R-session, but since you see this error | ||||
|   message there was a problem. | ||||
|  | ||||
|   Perhaps you need to restart your R-session. Try closing the RStudio | ||||
|   project and reopening it from the File > Recent Projects menu. | ||||
|  | ||||
|   Perhaps there was a syntax error in your file. Then not all the | ||||
|   instructions in the file are executed. Check the file: is your | ||||
|   email perhaps not defined? Or did you type it without quotation | ||||
|   marks? | ||||
|  | ||||
|   Try fixing problems, and then restart R as described above. | ||||
|  | ||||
|   If none of this fixes the problem, ask for help. | ||||
| ' | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										88
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										88
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,44 +1,44 @@ | ||||
| # Miscellaneous | ||||
| .Ds_store | ||||
| instructor/ | ||||
| dev/ | ||||
| # myScripts/ # We don't want to ignore this so we can save our work to our own fork. | ||||
|  | ||||
| # History files | ||||
| .Rhistory | ||||
| .Rapp.history | ||||
|  | ||||
| # Session Data files | ||||
| # .RData | ||||
|  | ||||
| # Files produced in assignments | ||||
| data/APSESphyloSet.mfa | ||||
| data/APSEStreeRproml.rds | ||||
|  | ||||
| # Example code in package build process | ||||
| *-Ex.R | ||||
|  | ||||
| # Output files from R CMD build | ||||
| /*.tar.gz | ||||
|  | ||||
| # Output files from R CMD check | ||||
| /*.Rcheck/ | ||||
|  | ||||
| # RStudio files | ||||
| .Rproj.user/ | ||||
|  | ||||
| # produced vignettes | ||||
| vignettes/*.html | ||||
| vignettes/*.pdf | ||||
|  | ||||
| # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 | ||||
| .httr-oauth | ||||
|  | ||||
| # knitr and R markdown default cache directories | ||||
| /*_cache/ | ||||
| /cache/ | ||||
|  | ||||
| # Temporary files created by R markdown | ||||
| *.utf8.md | ||||
| *.knit.md | ||||
| .Rproj.user | ||||
| # Miscellaneous | ||||
| .Ds_store | ||||
| instructor/ | ||||
| dev/ | ||||
| # myScripts/ # We don't want to ignore this so we can save our work to our own fork. | ||||
|  | ||||
| # History files | ||||
| .Rhistory | ||||
| .Rapp.history | ||||
|  | ||||
| # Session Data files | ||||
| # .RData | ||||
|  | ||||
| # Files produced in assignments | ||||
| data/APSESphyloSet.mfa | ||||
| data/APSEStreeRproml.rds | ||||
|  | ||||
| # Example code in package build process | ||||
| *-Ex.R | ||||
|  | ||||
| # Output files from R CMD build | ||||
| /*.tar.gz | ||||
|  | ||||
| # Output files from R CMD check | ||||
| /*.Rcheck/ | ||||
|  | ||||
| # RStudio files | ||||
| .Rproj.user/ | ||||
|  | ||||
| # produced vignettes | ||||
| vignettes/*.html | ||||
| vignettes/*.pdf | ||||
|  | ||||
| # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 | ||||
| .httr-oauth | ||||
|  | ||||
| # knitr and R markdown default cache directories | ||||
| /*_cache/ | ||||
| /cache/ | ||||
|  | ||||
| # Temporary files created by R markdown | ||||
| *.utf8.md | ||||
| *.knit.md | ||||
| .Rproj.user | ||||
|   | ||||
							
								
								
									
										76
									
								
								.tmp.R
									
									
									
									
									
								
							
							
						
						
									
										76
									
								
								.tmp.R
									
									
									
									
									
								
							| @@ -1,38 +1,38 @@ | ||||
| # myScript.R | ||||
| # | ||||
| # --- As you work with this file, you can delete the instructions below -------- | ||||
| # Write your notes and code experiments into this document. Save it | ||||
| # from time to time - however I recommend that you do not _commit_ | ||||
| # your saved version. | ||||
| # | ||||
| # As long as you do not _commit_ this script to version control, | ||||
| # you can _pull_ updated versions of the entire project from GitHub | ||||
| # by using the RStudio version control interface. However, once | ||||
| # you _commit_ any file in your local version, RStudio will require | ||||
| # you to resolve conflicts before you can _pull_ updates. | ||||
| # --- As you work with this file, you can delete the instructions above -------- | ||||
| # | ||||
| ## Purpose: <...> | ||||
| # | ||||
| # Version: <...> | ||||
| # | ||||
| # Date:    <...> | ||||
| # Author:  <Name> (<namee@mail.utoronto.ca>) | ||||
| # | ||||
| # Versions: | ||||
| # | ||||
| #   <number>    <Features> | ||||
| # | ||||
| # TODO: | ||||
| #   <...> | ||||
| # | ||||
| # ==================================================================== | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|  | ||||
| # myScript.R | ||||
| # | ||||
| # --- As you work with this file, you can delete the instructions below -------- | ||||
| # Write your notes and code experiments into this document. Save it | ||||
| # from time to time - however I recommend that you do not _commit_ | ||||
| # your saved version. | ||||
| # | ||||
| # As long as you do not _commit_ this script to version control, | ||||
| # you can _pull_ updated versions of the entire project from GitHub | ||||
| # by using the RStudio version control interface. However, once | ||||
| # you _commit_ any file in your local version, RStudio will require | ||||
| # you to resolve conflicts before you can _pull_ updates. | ||||
| # --- As you work with this file, you can delete the instructions above -------- | ||||
| # | ||||
| ## Purpose: <...> | ||||
| # | ||||
| # Version: <...> | ||||
| # | ||||
| # Date:    <...> | ||||
| # Author:  <Name> (<namee@mail.utoronto.ca>) | ||||
| # | ||||
| # Versions: | ||||
| # | ||||
| #   <number>    <Features> | ||||
| # | ||||
| # TODO: | ||||
| #   <...> | ||||
| # | ||||
| # ==================================================================== | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|  | ||||
|   | ||||
							
								
								
									
										1308
									
								
								.utilities.R
									
									
									
									
									
								
							
							
						
						
									
										1308
									
								
								.utilities.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,257 +1,257 @@ | ||||
| # 2021-10-12_In-Class_exploration.R | ||||
| # | ||||
| #         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D ===== | ||||
| # | ||||
| # Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12 | ||||
| # Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu | ||||
| # Scribe:     boris.steipe@utoronto.ca | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # In our last session we explored some properties of amino acids and noted that | ||||
| # we can arrange them in a scatter-plot according to some properties. But can | ||||
| # we also arrange them according to generic properties, i.e. taking all | ||||
| # published property scales into account? We will try to use all tables from | ||||
| # the seqinr package. | ||||
|  | ||||
| # First we load the package - this makes all datasets immediately available and | ||||
| # we don't have to load them one by one. | ||||
|  | ||||
| library(seqinr) | ||||
|  | ||||
| # Determine what datasets are available | ||||
| # | ||||
| # Using "find in topic" ... "amino acid" | ||||
| data(aacost) | ||||
| data(aaindex) | ||||
| data(pK) | ||||
|  | ||||
| # We note that datasets may be sorted in different ways: for example | ||||
| # alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala, | ||||
| # Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino | ||||
| # acids are sorted in the same way. | ||||
|  | ||||
| # Build a datastructure ... | ||||
| # rows: amino acids | ||||
| # columns: properties | ||||
|  | ||||
| # Are all lists in aaindex organized in the same way? | ||||
|  | ||||
| refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item | ||||
|                                   # index as a reference list | ||||
|  | ||||
| # Loop over each list in aaindex | ||||
| for (i in 1:length(aaindex)) { | ||||
| #   get the I-vector | ||||
|   x <- aaindex[[i]]$I | ||||
| #   get the names | ||||
|   x <- names(x) | ||||
| #   compare with the names of our reference list | ||||
| #   the == and != operators are vectorized. Applying them to two vectors | ||||
| #   gives TRUE or FALSE for each pair of elements. any() or all() can be | ||||
| #   applied to logical vectors to analyse them and return a single result. | ||||
| #   if (...) conditions evaluate only a single value and will throw a warning if | ||||
| #   there is more than one. | ||||
|  | ||||
|   if (any(x != refNames)) { | ||||
|     # There was at least one not-equal pair - so: complain | ||||
|     print(sprintf("Problem in list %d: names don't match", i)) | ||||
|   } | ||||
| } | ||||
|  | ||||
| # If we get here without identifying problems, it means all pairs of | ||||
| # rownames match throughout the aaindex list. | ||||
|  | ||||
|  | ||||
| # Next: what is the correct syntax to add one vector (the "I" vector of | ||||
| # one of the list elements) to our dataframe? | ||||
| aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index | ||||
| aaData[,2] <- aaindex[[2]]$I            # ... add the second index | ||||
|  | ||||
| str(aaData)  # Confirm: we now have a two-column dataframe | ||||
|  | ||||
| # Next: add the rest ... | ||||
| for (i in 3:length(aaindex)) { | ||||
|   #   get the I-vector and write it into our dataframe | ||||
|   aaData[,i] <- aaindex[[i]]$I | ||||
| } | ||||
|  | ||||
| # Sanity check | ||||
| plot(aaData[,37], aaData[,544])  # plot two arbitrary indices against each other | ||||
|  | ||||
| # Looks good. | ||||
|  | ||||
| # We finished building our data structure ... but let's add the aacost table | ||||
| # aacost is ordered differently: | ||||
| rownames(aaData) | ||||
| aacost[ , 1] | ||||
|  | ||||
| # using order(), applied to aacost - ordering the column with column-name | ||||
| # "aaa" | ||||
| sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes | ||||
| aacost[sel, "aaa"] # applying the order vector sorts the column | ||||
|  | ||||
| # Is this the same order as refNames? | ||||
| refNames == aacost[sel, "aaa"]  # Yes! | ||||
|  | ||||
| # add the data from column "tot" (i.e. total metabolic cost) after the | ||||
| # last column of aaData | ||||
| aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"] | ||||
|  | ||||
| # Done. | ||||
| str(aaData)  # A dataframe with 20 rows and 545 columns | ||||
|  | ||||
| # To answer the question "Which amino acids are similar to each other?" we | ||||
| # need to reduce this 545-dimensional dataset to fewer dimensions, otherwise | ||||
| # we will succumb to the "Curse of Dimensionality": | ||||
| # | ||||
| #    "in high dimensional data, however, all objects appear | ||||
| #     to be sparse and dissimilar in many ways..." | ||||
| #                   https://en.wikipedia.org/wiki/Curse_of_dimensionality | ||||
| # | ||||
| # A classic way to do this is Principal Component Analysis (PCA) ... | ||||
| # (Principal components analysis) | ||||
| # | ||||
| # PCA expects objects in columns, properties in rows. Therefore we need to | ||||
| # transpose our dataset: | ||||
|  | ||||
| aaPCA <- prcomp(t(aaData)) | ||||
|  | ||||
| # This creates an error, because some of our indices contain NA values! | ||||
| # Which indices are these? | ||||
|  | ||||
| # We create a vector "sel" for which we check whether any element in each | ||||
| # column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can | ||||
| # then use this vector to subset our dataframe. | ||||
|  | ||||
| sel <- logical() | ||||
|  | ||||
| for (i in 1:ncol(aaData)) {         # for each index | ||||
|   if (any(is.na(aaData[,i]))) {     #   if there is any NA value ... | ||||
|     sel <- c(sel, FALSE)            #     add a FALSE element to the vector | ||||
|   } else {                          #   else | ||||
|     sel <- c(sel, TRUE)             #     add a TRUE element | ||||
|   } | ||||
| } | ||||
|  | ||||
| # Done. sel now subsets only the NA-free columns | ||||
| 545 - sum(sel)                      # 13 columns excluded | ||||
|  | ||||
| # Do the PCA ... use the prcomp() function | ||||
| aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set | ||||
|  | ||||
| str(aaPCA)   # structure of the result | ||||
|  | ||||
| plot(aaPCA)                         # plot the contributions of the | ||||
|                                     # components to the variance | ||||
|  | ||||
| plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC | ||||
|      aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame | ||||
|      type ="n")                     # just to set up the coordinate system | ||||
|  | ||||
| text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into | ||||
|      aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions | ||||
|      labels = rownames(aaPCA$rotation)) | ||||
|  | ||||
| # PCA results are sensitive to the absolute numeric value of the features that | ||||
| # we are comparing. The prcomp() function has an option scale. = TRUE that | ||||
| # scales each column of its input matrix to a variance of 1.0. Note that after | ||||
| # transposing, these columns are the amino acids - not the property indices - | ||||
| # so it is the amino acids that are given approximately equal weight. | ||||
|  | ||||
| aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE) | ||||
|  | ||||
| plot(aaPCA) | ||||
|  | ||||
| plot(aaPCA$rotation[ , 1], | ||||
|      aaPCA$rotation[ , 2], | ||||
|      type ="n") | ||||
| text(aaPCA$rotation[ , 1], | ||||
|      aaPCA$rotation[ , 2], | ||||
|      labels = rownames(aaPCA$rotation)) | ||||
|  | ||||
|  | ||||
| # Next we try to identify what the PCs correspond to. We see whether there are | ||||
| # specific features that are highly correlated with the PCs | ||||
|  | ||||
| # ==== Rotation 1 =================== | ||||
| # | ||||
|  | ||||
| (PC1 <- aaPCA$rotation[ , 1])  # Assign PC1 | ||||
|  | ||||
| # The function cor() calculates Pearson coefficients of correlation | ||||
| cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37 | ||||
|  | ||||
|  | ||||
| # Iterate over all columns and calculate correlations | ||||
| cors <- numeric() | ||||
|  | ||||
| for (i in 1:ncol(aaData)) { | ||||
|   cors[i] <- cor(PC1, aaData[ , i]) | ||||
| } | ||||
|  | ||||
| summary(cors) | ||||
| #    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's | ||||
| # -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13 | ||||
| # | ||||
| #  The max correlation is ~0.6. That is not very high. Which index is it? | ||||
|  | ||||
| which(cors == max(cors, na.rm = TRUE)) | ||||
|  | ||||
| aaindex[[504]]   # Linker propensity ??? | ||||
|  | ||||
| cor(PC1, aaindex[[504]]$I) # Did we get the right index? | ||||
|  | ||||
| # Plot this ... | ||||
| plot(aaPCA$rotation[ , 1], | ||||
|      aaindex[[504]]$I, | ||||
|      type ="n") | ||||
| text(aaPCA$rotation[ , 1], | ||||
|      aaindex[[504]]$I, | ||||
|      labels = rownames(aaPCA$rotation)) | ||||
|  | ||||
| # This is essentially a random correlation but for Cysteine ... | ||||
|  | ||||
|  | ||||
| # ==== Rotation 2 =================== | ||||
| # | ||||
| # same process | ||||
| PC2 <- aaPCA$rotation[ , 2] | ||||
|  | ||||
| cors2 <- numeric() | ||||
|  | ||||
| for (i in 1:ncol(aaData)) { | ||||
|   cors2[i] <- cor(PC2, aaData[ , i]) | ||||
| } | ||||
|  | ||||
| summary(cors2) | ||||
| #     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's | ||||
| # -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13 | ||||
|  | ||||
| # Here we have quite strong correlations | ||||
|  | ||||
| which(cors2 == max(cors2, na.rm = TRUE)) | ||||
|  | ||||
| aaindex[[148]] | ||||
|  | ||||
| # this index itself is correlated with many other indices | ||||
|  | ||||
| cor(PC2, aaindex[[148]]$I)   # confirm that we have the right index | ||||
|  | ||||
| # Plot this too... | ||||
| plot(aaPCA$rotation[ , 2], | ||||
|      aaindex[[148]]$I, | ||||
|      type ="n") | ||||
| text(aaPCA$rotation[ , 2], | ||||
|      aaindex[[148]]$I, | ||||
|      labels = rownames(aaPCA$rotation)) | ||||
|  | ||||
| # This correlates well with hydrophobicity measures. In this case the | ||||
| # PC is to a certain degree interpretable - but this is not always the case | ||||
| # with PCA (see the example of the first PC). | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # 2021-10-12_In-Class_exploration.R | ||||
| # | ||||
| #         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D ===== | ||||
| # | ||||
| # Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12 | ||||
| # Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu | ||||
| # Scribe:     boris.steipe@utoronto.ca | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # In our last session we explored some properties of amino acids and noted that | ||||
| # we can arrange them in a scatter-plot according to some properties. But can | ||||
| # we also arrange them according to generic properties, i.e. taking all | ||||
| # published property scales into account? We will try to use all tables from | ||||
| # the seqinr package. | ||||
|  | ||||
| # First we load the package - this makes all datasets immediately available and | ||||
| # we don't have to load them one by one. | ||||
|  | ||||
| library(seqinr) | ||||
|  | ||||
| # Determine what datasets are available | ||||
| # | ||||
| # Using "find in topic" ... "amino acid" | ||||
| data(aacost) | ||||
| data(aaindex) | ||||
| data(pK) | ||||
|  | ||||
| # We note that datasets may be sorted in different ways: for example | ||||
| # alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala, | ||||
| # Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino | ||||
| # acids are sorted in the same way. | ||||
|  | ||||
| # Build a datastructure ... | ||||
| # rows: amino acids | ||||
| # columns: properties | ||||
|  | ||||
| # Are all lists in aaindex organized in the same way? | ||||
|  | ||||
| refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item | ||||
|                                   # index as a reference list | ||||
|  | ||||
| # Loop over each list in aaindex | ||||
| for (i in 1:length(aaindex)) { | ||||
| #   get the I-vector | ||||
|   x <- aaindex[[i]]$I | ||||
| #   get the names | ||||
|   x <- names(x) | ||||
| #   compare with the names of our reference list | ||||
| #   the == and != operators are vectorized. Applying them to two vectors | ||||
| #   gives TRUE or FALSE for each pair of elements. any() or all() can be | ||||
| #   applied to logical vectors to analyse them and return a single result. | ||||
| #   if (...) conditions evaluate only a single value and will throw a warning if | ||||
| #   there is more than one. | ||||
|  | ||||
|   if (any(x != refNames)) { | ||||
|     # There was at least one not-equal pair - so: complain | ||||
|     print(sprintf("Problem in list %d: names don't match", i)) | ||||
|   } | ||||
| } | ||||
|  | ||||
| # If we get here without identifying problems, it means all pairs of | ||||
| # rownames match throughout the aaindex list. | ||||
|  | ||||
|  | ||||
| # Next: what is the correct syntax to add one vector (the "I" vector of | ||||
| # one of the list elements) to our dataframe? | ||||
| aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index | ||||
| aaData[,2] <- aaindex[[2]]$I            # ... add the second index | ||||
|  | ||||
| str(aaData)  # Confirm: we now have a two-column dataframe | ||||
|  | ||||
| # Next: add the rest ... | ||||
| for (i in 3:length(aaindex)) { | ||||
|   #   get the I-vector and write it into our dataframe | ||||
|   aaData[,i] <- aaindex[[i]]$I | ||||
| } | ||||
|  | ||||
| # Sanity check | ||||
| plot(aaData[,37], aaData[,544])  # plot two arbitrary indices against each other | ||||
|  | ||||
| # Looks good. | ||||
|  | ||||
| # We finished building our data structure ... but let's add the aacost table | ||||
| # aacost is ordered differently: | ||||
| rownames(aaData) | ||||
| aacost[ , 1] | ||||
|  | ||||
| # using order(), applied to aacost - ordering the column with column-name | ||||
| # "aaa" | ||||
| sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes | ||||
| aacost[sel, "aaa"] # applying the order vector sorts the column | ||||
|  | ||||
| # Is this the same order as refNames? | ||||
| refNames == aacost[sel, "aaa"]  # Yes! | ||||
|  | ||||
| # add the data from column "tot" (i.e. total metabolic cost) after the | ||||
| # last column of aaData | ||||
| aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"] | ||||
|  | ||||
| # Done. | ||||
| str(aaData)  # A dataframe with 20 rows and 545 columns | ||||
|  | ||||
| # To answer the question "Which amino acids are similar to each other?" we | ||||
| # need to reduce this 545-dimensional dataset to fewer dimensions, otherwise | ||||
| # we will succumb to the "Curse of Dimensionality": | ||||
| # | ||||
| #    "in high dimensional data, however, all objects appear | ||||
| #     to be sparse and dissimilar in many ways..." | ||||
| #                   https://en.wikipedia.org/wiki/Curse_of_dimensionality | ||||
| # | ||||
| # A classic way to do this is Principal Component Analysis (PCA) ... | ||||
| # (Principal components analysis) | ||||
| # | ||||
| # PCA expects objects in columns, properties in rows. Therefore we need to | ||||
| # transpose our dataset: | ||||
|  | ||||
| aaPCA <- prcomp(t(aaData)) | ||||
|  | ||||
| # This creates an error, because some of our indices contain NA values! | ||||
| # Which indices are these? | ||||
|  | ||||
| # We create a vector "sel" for which we check whether any element in each | ||||
| # column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can | ||||
| # then use this vector to subset our dataframe. | ||||
|  | ||||
| sel <- logical() | ||||
|  | ||||
| for (i in 1:ncol(aaData)) {         # for each index | ||||
|   if (any(is.na(aaData[,i]))) {     #   if there is any NA value ... | ||||
|     sel <- c(sel, FALSE)            #     add a FALSE element to the vector | ||||
|   } else {                          #   else | ||||
|     sel <- c(sel, TRUE)             #     add a TRUE element | ||||
|   } | ||||
| } | ||||
|  | ||||
| # Done. sel now subsets only the NA-free columns | ||||
| 545 - sum(sel)                      # 13 columns excluded | ||||
|  | ||||
| # Do the PCA ... use the prcomp() function | ||||
| aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set | ||||
|  | ||||
| str(aaPCA)   # structure of the result | ||||
|  | ||||
| plot(aaPCA)                         # plot the contributions of the | ||||
|                                     # components to the variance | ||||
|  | ||||
| plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC | ||||
|      aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame | ||||
|      type ="n")                     # just to set up the coordinate system | ||||
|  | ||||
| text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into | ||||
|      aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions | ||||
|      labels = rownames(aaPCA$rotation)) | ||||
|  | ||||
| # PCA results are sensitive to the absolute numeric value of the features that | ||||
| # we are comparing. The prcomp() function has an option scale. = TRUE that | ||||
| # scales each row of features so that the variance of the value is 1.0. This | ||||
| # ensures that each feature is given approximately equal weight. | ||||
|  | ||||
| aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE) | ||||
|  | ||||
| plot(aaPCA) | ||||
|  | ||||
| plot(aaPCA$rotation[ , 1], | ||||
|      aaPCA$rotation[ , 2], | ||||
|      type ="n") | ||||
| text(aaPCA$rotation[ , 1], | ||||
|      aaPCA$rotation[ , 2], | ||||
|      labels = rownames(aaPCA$rotation)) | ||||
|  | ||||
|  | ||||
| # Next we try to identify what the PCs correspond to. We see whether there are | ||||
| # specific features that are highly correlated with the PCs | ||||
|  | ||||
| # ==== Rotation 1 =================== | ||||
| # | ||||
|  | ||||
| (PC1 <- aaPCA$rotation[ , 1])  # Assign PC1 | ||||
|  | ||||
| # The function cor() calculates Pearson coefficients of correlation | ||||
| cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37 | ||||
|  | ||||
|  | ||||
| # Iterate over all columns and calculate correlations | ||||
| cors <- numeric() | ||||
|  | ||||
| for (i in 1:ncol(aaData)) { | ||||
|   cors[i] <- cor(PC1, aaData[ , i]) | ||||
| } | ||||
|  | ||||
| summary(cors) | ||||
| #    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's | ||||
| # -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13 | ||||
| # | ||||
| #  The max correlation is ~0.6. That is not very high. Which index is it? | ||||
|  | ||||
| which(cors == max(cors, na.rm = TRUE)) | ||||
|  | ||||
| aaindex[[504]]   # Linker propensity ??? | ||||
|  | ||||
| cor(PC1, aaindex[[504]]$I) # Did we get the right index? | ||||
|  | ||||
| # Plot this ... | ||||
| plot(aaPCA$rotation[ , 1], | ||||
|      aaindex[[504]]$I, | ||||
|      type ="n") | ||||
| text(aaPCA$rotation[ , 1], | ||||
|      aaindex[[504]]$I, | ||||
|      labels = rownames(aaPCA$rotation)) | ||||
|  | ||||
| # This is essentially a random correlation but for Cysteine ... | ||||
|  | ||||
|  | ||||
| # ==== Rotation 2 =================== | ||||
| # | ||||
| # same process | ||||
| PC2 <- aaPCA$rotation[ , 2] | ||||
|  | ||||
| cors2 <- numeric() | ||||
|  | ||||
| for (i in 1:ncol(aaData)) { | ||||
|   cors2[i] <- cor(PC2, aaData[ , i]) | ||||
| } | ||||
|  | ||||
| summary(cors2) | ||||
| #     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's | ||||
| # -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13 | ||||
|  | ||||
| # Here we have quite strong correlations | ||||
|  | ||||
| which(cors2 == max(cors2, na.rm = TRUE)) | ||||
|  | ||||
| aaindex[[148]] | ||||
|  | ||||
| # this index itself is correlated with many other indices | ||||
|  | ||||
| cor(PC2, aaindex[[148]]$I)   # confirm that we have the right index | ||||
|  | ||||
| # Plot this too... | ||||
| plot(aaPCA$rotation[ , 2], | ||||
|      aaindex[[148]]$I, | ||||
|      type ="n") | ||||
| text(aaPCA$rotation[ , 2], | ||||
|      aaindex[[148]]$I, | ||||
|      labels = rownames(aaPCA$rotation)) | ||||
|  | ||||
| # This correlates well with hydrophobicity measures. In this case the | ||||
| # PC is to a certain degree interpretable - but this is not always the case | ||||
| # with PCA (see the example of the first PC). | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,161 +1,161 @@ | ||||
| # tocID <- "ABC-Install_all_packages.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              Installing all packages in this course | ||||
| # | ||||
| # Version:  1.0 | ||||
| # | ||||
| # Date:     2021  10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.0    New code | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                          Line | ||||
| #TOC> ---------------------------------------------- | ||||
| #TOC>   1        Packages                         33 | ||||
| #TOC>   2        CRAN packages                    98 | ||||
| #TOC>   3        Bioconductor packages           127 | ||||
| #TOC>   4        Other package sources           142 | ||||
| #TOC>   5        Updating packages               148 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Packages  ============================================================ | ||||
|  | ||||
| # Much of R's functionality is contributed in packages: bundles of R scripts | ||||
| # or code in other languages, pre-configured objects, and datasets. Making this | ||||
| # functionality available is often done by issuing a library(<package-name>) | ||||
| # command, however this is not the preferred way, since it may override other | ||||
| # R functions and it makes it harder to understand where the source code of | ||||
| # a particular function is located. In this course we call the function name | ||||
| # prefixed with the package name and two colons: | ||||
| #   <package-name>::<function-name>() | ||||
| # This is the preferred way, since it is explicit. | ||||
| # | ||||
| # Regardless of which idiom one uses to call the actual function, the package | ||||
| #  needs to be "installed" first, i.e. the code must have been downloaded | ||||
| # from CRAN, or using the BiocManager::install() function. | ||||
| # | ||||
| # This script contains download commands for all packages that are used in the | ||||
| # course. You can execute the script line by line (or even source the entire | ||||
| # script) to make sure all packages can be installed on your computer. Just | ||||
| # one reminder: if you are ever asked to install from source, the correct | ||||
| # answer is usually "no" - except if you really know what you are doing and why. | ||||
| # | ||||
| # Once packages are installed you can get additional information about | ||||
| # the contents of a package with the commands: | ||||
| #  library(help=<package-name>)       # basic information | ||||
| #  browseVignettes("<package-name>")  # available vignettes | ||||
| #  data(package = "<package-name>")   # available datasets | ||||
| # | ||||
| #  ... and you can load data sets with: | ||||
| #  data(<data-set-name>, package = "<package-name>") | ||||
| # | ||||
| #  All packages here are installed only when they have not been installed | ||||
| #  before, using the following idiom: | ||||
| # | ||||
| #     if (! requireNamespace("<package-name>", quietly=TRUE)) { | ||||
| #       install.packages("<package-name>") | ||||
| #     } | ||||
| # | ||||
| #  ... or its BiocManager::install() equivalent: | ||||
| # | ||||
| # if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) { | ||||
| #   BiocManager::install("<bioconductor-package-name>") | ||||
| # } | ||||
| # | ||||
| #  If you want to _force_ a re-installation of the package, simply issue | ||||
| #  the install.packages("<package-name>") command on its own. For compactness | ||||
| #  we wrap the idiom into a function, which can also switch between CRAN | ||||
| #  and BIOconductor sources: | ||||
|  | ||||
| installIfNeeded <- function(package, s = "CRAN") { | ||||
|   # s: "CRAN" or "BIO" | ||||
|   if (s == "CRAN") { | ||||
|     if (! requireNamespace(package, quietly=TRUE)) { | ||||
|       install.packages(package) | ||||
|     } | ||||
|   } else if (s == "BIO") { | ||||
|     if (! requireNamespace("BiocManager", quietly=TRUE)) { | ||||
|       install.packages("BiocManager") | ||||
|     } | ||||
|     if (! requireNamespace(package, quietly=TRUE)) { | ||||
|       BiocManager::install(package) | ||||
|     } | ||||
|   } else { | ||||
|     stop(sprintf("Unknown source \"%s\".", s)) | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    2  CRAN packages  ======================================================= | ||||
|  | ||||
| installIfNeeded("ape") | ||||
| installIfNeeded("BiocManager") | ||||
| installIfNeeded("bio3d") | ||||
| installIfNeeded("evd") | ||||
| installIfNeeded("ggseqlogo") | ||||
| installIfNeeded("ggtern") | ||||
| installIfNeeded("hexbin") | ||||
| installIfNeeded("httr") | ||||
| installIfNeeded("igraph") | ||||
| installIfNeeded("jsonlite") | ||||
| installIfNeeded("magrittr") | ||||
| installIfNeeded("MASS") | ||||
| installIfNeeded("microbenchmark") | ||||
| installIfNeeded("phangorn") | ||||
| installIfNeeded("plotly") | ||||
| installIfNeeded("plotrix") | ||||
| installIfNeeded("profvis") | ||||
| installIfNeeded("robustbase") | ||||
| installIfNeeded("RColorBrewer") | ||||
| installIfNeeded("Rphylip") | ||||
| installIfNeeded("rvest") | ||||
| installIfNeeded("seqinr") | ||||
| installIfNeeded("stringi") | ||||
| installIfNeeded("taxize") | ||||
| installIfNeeded("testthat") | ||||
| installIfNeeded("xml2") | ||||
|  | ||||
| # =    3  Bioconductor packages  =============================================== | ||||
|  | ||||
| installIfNeeded("Biobase",       s = "BIO") | ||||
| installIfNeeded("biomaRt",       s = "BIO") | ||||
| installIfNeeded("Biostrings",    s = "BIO") | ||||
| installIfNeeded("DECIPHER",      s = "BIO") | ||||
| installIfNeeded("GEOquery",      s = "BIO") | ||||
| installIfNeeded("GOSim",         s = "BIO") | ||||
| installIfNeeded("limma",         s = "BIO") | ||||
| installIfNeeded("msa",           s = "BIO") | ||||
| installIfNeeded("org.Sc.sgd.db", s = "BIO") | ||||
| installIfNeeded("prada",         s = "BIO") | ||||
| installIfNeeded("topGO",         s = "BIO") | ||||
|  | ||||
|  | ||||
| # =    4  Other package sources  =============================================== | ||||
|  | ||||
| # Using sources other than CRAN or Bioconductor to download general-purpose | ||||
| # programs that run on your computer is not generally recommended. | ||||
|  | ||||
|  | ||||
| # =    5  Updating packages  =================================================== | ||||
|  | ||||
| # From time to time, update CRAN packages with the following command ... | ||||
|  | ||||
| update.packages() | ||||
|  | ||||
| # ... and also update Bioconductor packages as follows: | ||||
|  | ||||
| BiocManager::install() | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "ABC-Install_all_packages.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              Installing all packages in this course | ||||
| # | ||||
| # Version:  1.0 | ||||
| # | ||||
| # Date:     2021  10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.0    New code | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                          Line | ||||
| #TOC> ---------------------------------------------- | ||||
| #TOC>   1        Packages                         33 | ||||
| #TOC>   2        CRAN packages                    98 | ||||
| #TOC>   3        Bioconductor packages           127 | ||||
| #TOC>   4        Other package sources           142 | ||||
| #TOC>   5        Updating packages               148 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Packages  ============================================================ | ||||
|  | ||||
| # Much of R's functionality is contributed in packages: bundles of R scripts | ||||
| # or code in other languages, pre-configured objects, and datasets. Making this | ||||
| # functionality available is often done by issuing a library(<package-name>) | ||||
| # command, however this is not the preferred way, since it may override other | ||||
| # R functions and it makes it harder to understand where the source code of | ||||
| # a particular function is located. In this course we call the function name | ||||
| # prefixed with the package name and two colons: | ||||
| #   <package-name>::<function-name>() | ||||
| # This is the preferred way, since it is explicit. | ||||
| # | ||||
| # Regardless of which idiom one uses to call the actual function, the package | ||||
| #  needs to be "installed" first, i.e. the code must have been downloaded | ||||
| # from CRAN, or using the BiocManager::install() function. | ||||
| # | ||||
| # This script contains download commands for all packages that are used in the | ||||
| # course. You can execute the script line by line (or even source the entire | ||||
| # script) to make sure all packages can be installed on your computer. Just | ||||
| # one reminder: if you are ever asked to install from source, the correct | ||||
| # answer is usually "no" - except if you really know what you are doing and why. | ||||
| # | ||||
| # Once packages are installed you can get additional information about | ||||
| # the contents of a package with the commands: | ||||
| #  library(help=<package-name>)       # basic information | ||||
| #  browseVignettes("<package-name>")  # available vignettes | ||||
| #  data(package = "<package-name>")   # available datasets | ||||
| # | ||||
| #  ... and you can load data sets with: | ||||
| #  data(<data-set-name>, package = "<package-name>") | ||||
| # | ||||
| #  All packages here are installed only when they have not been installed | ||||
| #  before, using the following idiom: | ||||
| # | ||||
| #     if (! requireNamespace("<package-name>", quietly=TRUE)) { | ||||
| #       install.packages("<package-name>") | ||||
| #     } | ||||
| # | ||||
| #  ... or its BiocManager::install() equivalent: | ||||
| # | ||||
| # if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) { | ||||
| #   BiocManager::install("<bioconductor-package-name>") | ||||
| # } | ||||
| # | ||||
| #  If you want to _force_ a re-installation of the package, simply issue | ||||
| #  the install.packages("<package-name>") command on its own. For compactness | ||||
| #  we wrap the idiom into a function, which can also switch between CRAN | ||||
| #  and BIOconductor sources: | ||||
|  | ||||
| installIfNeeded <- function(package, s = "CRAN") { | ||||
|   # s: "CRAN" or "BIO" | ||||
|   if (s == "CRAN") { | ||||
|     if (! requireNamespace(package, quietly=TRUE)) { | ||||
|       install.packages(package) | ||||
|     } | ||||
|   } else if (s == "BIO") { | ||||
|     if (! requireNamespace("BiocManager", quietly=TRUE)) { | ||||
|       install.packages("BiocManager") | ||||
|     } | ||||
|     if (! requireNamespace(package, quietly=TRUE)) { | ||||
|       BiocManager::install(package) | ||||
|     } | ||||
|   } else { | ||||
|     stop(sprintf("Unknown source \"%s\".", s)) | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    2  CRAN packages  ======================================================= | ||||
|  | ||||
| installIfNeeded("ape") | ||||
| installIfNeeded("BiocManager") | ||||
| installIfNeeded("bio3d") | ||||
| installIfNeeded("evd") | ||||
| installIfNeeded("ggseqlogo") | ||||
| installIfNeeded("ggtern") | ||||
| installIfNeeded("hexbin") | ||||
| installIfNeeded("httr") | ||||
| installIfNeeded("igraph") | ||||
| installIfNeeded("jsonlite") | ||||
| installIfNeeded("magrittr") | ||||
| installIfNeeded("MASS") | ||||
| installIfNeeded("microbenchmark") | ||||
| installIfNeeded("phangorn") | ||||
| installIfNeeded("plotly") | ||||
| installIfNeeded("plotrix") | ||||
| installIfNeeded("profvis") | ||||
| installIfNeeded("robustbase") | ||||
| installIfNeeded("RColorBrewer") | ||||
| installIfNeeded("Rphylip") | ||||
| installIfNeeded("rvest") | ||||
| installIfNeeded("seqinr") | ||||
| installIfNeeded("stringi") | ||||
| installIfNeeded("taxize") | ||||
| installIfNeeded("testthat") | ||||
| installIfNeeded("xml2") | ||||
|  | ||||
| # =    3  Bioconductor packages  =============================================== | ||||
|  | ||||
| installIfNeeded("Biobase",       s = "BIO") | ||||
| installIfNeeded("biomaRt",       s = "BIO") | ||||
| installIfNeeded("Biostrings",    s = "BIO") | ||||
| installIfNeeded("DECIPHER",      s = "BIO") | ||||
| installIfNeeded("GEOquery",      s = "BIO") | ||||
| installIfNeeded("GOSim",         s = "BIO") | ||||
| installIfNeeded("limma",         s = "BIO") | ||||
| installIfNeeded("msa",           s = "BIO") | ||||
| installIfNeeded("org.Sc.sgd.db", s = "BIO") | ||||
| installIfNeeded("prada",         s = "BIO") | ||||
| installIfNeeded("topGO",         s = "BIO") | ||||
|  | ||||
|  | ||||
| # =    4  Other package sources  =============================================== | ||||
|  | ||||
| # Using sources other than CRAN or Bioconductor to download general-purpose | ||||
| # programs that run on your computer is not generally recommended. | ||||
|  | ||||
|  | ||||
| # =    5  Updating packages  =================================================== | ||||
|  | ||||
| # From time to time, update CRAN packages with the following command ... | ||||
|  | ||||
| update.packages() | ||||
|  | ||||
| # ... and also update Bioconductor packages as follows: | ||||
|  | ||||
| BiocManager::install() | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,100 +1,100 @@ | ||||
| # addSACCE_APSESproteins.R | ||||
| # Adds the Saccharomyces cerevisiae APSES proteins to myDB | ||||
| # | ||||
|  | ||||
| myDB$protein <- | ||||
|     rbind(myDB$protein, | ||||
|           data.frame( | ||||
|               ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), | ||||
|               name = "SWI4_SACCE", | ||||
|               RefSeqID = "NP_011036", | ||||
|               UniProtID = "P25302", | ||||
|               taxonomy.ID = as.integer(4932), | ||||
|               sequence = dbSanitizeSequence(" | ||||
|         1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv | ||||
|        61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf | ||||
|        121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss | ||||
|        181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn | ||||
|        241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln | ||||
|        301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs | ||||
|        361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd | ||||
|        421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey | ||||
|        481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml | ||||
|        541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie | ||||
|        601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr | ||||
|        661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq | ||||
|        721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl | ||||
|        781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns | ||||
|        841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve | ||||
|        901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl | ||||
|        961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt | ||||
|        1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids | ||||
|        1081 klddiekdlr ana"), | ||||
|               stringsAsFactors = FALSE)) | ||||
|  | ||||
| myDB$protein <- | ||||
|     rbind(myDB$protein, | ||||
|           data.frame( | ||||
|               ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), | ||||
|               name = "PHD1_SACCE", | ||||
|               RefSeqID = "NP_012881", | ||||
|               UniProtID = "P36093", | ||||
|               taxonomy.ID = as.integer(4932), | ||||
|               sequence = dbSanitizeSequence(" | ||||
|         1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv | ||||
|        61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm | ||||
|       121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst | ||||
|       181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs | ||||
|       241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl | ||||
|       301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr | ||||
|       361 aknels"), | ||||
|               stringsAsFactors = FALSE)) | ||||
|  | ||||
| myDB$protein <- | ||||
|     rbind(myDB$protein, | ||||
|           data.frame( | ||||
|               ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), | ||||
|               name = "SOK2_SACCE", | ||||
|               RefSeqID = "NP_013729", | ||||
|               UniProtID = "P53438", | ||||
|               taxonomy.ID = as.integer(4932), | ||||
|               sequence = dbSanitizeSequence(" | ||||
|         1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq | ||||
|        61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna | ||||
|       121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt | ||||
|       181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp | ||||
|       241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip | ||||
|       301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt | ||||
|       361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw | ||||
|       421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm | ||||
|       481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp | ||||
|       541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp | ||||
|       601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp | ||||
|       661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq | ||||
|       721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt | ||||
|       781 kkqek"), | ||||
|               stringsAsFactors = FALSE)) | ||||
|  | ||||
| myDB$protein <- | ||||
|     rbind(myDB$protein, | ||||
|           data.frame( | ||||
|               ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), | ||||
|               name = "XBP1_SACCE", | ||||
|               RefSeqID = "NP_012165", | ||||
|               UniProtID = "P40489", | ||||
|               taxonomy.ID = as.integer(4932), | ||||
|               sequence = dbSanitizeSequence(" | ||||
|         1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf | ||||
|        61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal | ||||
|       121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh | ||||
|       181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae | ||||
|       241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann | ||||
|       301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl | ||||
|       361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa | ||||
|       421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd | ||||
|       481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr | ||||
|       541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh | ||||
|       601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"), | ||||
|               stringsAsFactors = FALSE)) | ||||
|  | ||||
| # [END] | ||||
| # addSACCE_APSESproteins.R | ||||
| # Adds the Saccharomyces cerevisiae APSES proteins to myDB | ||||
| # | ||||
|  | ||||
| myDB$protein <- | ||||
|     rbind(myDB$protein, | ||||
|           data.frame( | ||||
|               ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), | ||||
|               name = "SWI4_SACCE", | ||||
|               RefSeqID = "NP_011036", | ||||
|               UniProtID = "P25302", | ||||
|               taxonomy.ID = as.integer(4932), | ||||
|               sequence = dbSanitizeSequence(" | ||||
|         1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv | ||||
|        61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf | ||||
|        121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss | ||||
|        181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn | ||||
|        241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln | ||||
|        301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs | ||||
|        361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd | ||||
|        421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey | ||||
|        481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml | ||||
|        541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie | ||||
|        601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr | ||||
|        661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq | ||||
|        721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl | ||||
|        781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns | ||||
|        841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve | ||||
|        901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl | ||||
|        961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt | ||||
|        1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids | ||||
|        1081 klddiekdlr ana"), | ||||
|               stringsAsFactors = FALSE)) | ||||
|  | ||||
| myDB$protein <- | ||||
|     rbind(myDB$protein, | ||||
|           data.frame( | ||||
|               ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), | ||||
|               name = "PHD1_SACCE", | ||||
|               RefSeqID = "NP_012881", | ||||
|               UniProtID = "P36093", | ||||
|               taxonomy.ID = as.integer(4932), | ||||
|               sequence = dbSanitizeSequence(" | ||||
|         1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv | ||||
|        61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm | ||||
|       121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst | ||||
|       181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs | ||||
|       241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl | ||||
|       301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr | ||||
|       361 aknels"), | ||||
|               stringsAsFactors = FALSE)) | ||||
|  | ||||
| myDB$protein <- | ||||
|     rbind(myDB$protein, | ||||
|           data.frame( | ||||
|               ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), | ||||
|               name = "SOK2_SACCE", | ||||
|               RefSeqID = "NP_013729", | ||||
|               UniProtID = "P53438", | ||||
|               taxonomy.ID = as.integer(4932), | ||||
|               sequence = dbSanitizeSequence(" | ||||
|         1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq | ||||
|        61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna | ||||
|       121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt | ||||
|       181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp | ||||
|       241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip | ||||
|       301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt | ||||
|       361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw | ||||
|       421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm | ||||
|       481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp | ||||
|       541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp | ||||
|       601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp | ||||
|       661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq | ||||
|       721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt | ||||
|       781 kkqek"), | ||||
|               stringsAsFactors = FALSE)) | ||||
|  | ||||
| myDB$protein <- | ||||
|     rbind(myDB$protein, | ||||
|           data.frame( | ||||
|               ID = dbAutoincrement(myDB$protein$ID, ns = "ref"), | ||||
|               name = "XBP1_SACCE", | ||||
|               RefSeqID = "NP_012165", | ||||
|               UniProtID = "P40489", | ||||
|               taxonomy.ID = as.integer(4932), | ||||
|               sequence = dbSanitizeSequence(" | ||||
|         1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf | ||||
|        61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal | ||||
|       121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh | ||||
|       181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae | ||||
|       241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann | ||||
|       301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl | ||||
|       361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa | ||||
|       421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd | ||||
|       481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr | ||||
|       541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh | ||||
|       601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"), | ||||
|               stringsAsFactors = FALSE)) | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										138
									
								
								ABC-units.R
									
									
									
									
									
								
							
							
						
						
									
										138
									
								
								ABC-units.R
									
									
									
									
									
								
							| @@ -1,69 +1,69 @@ | ||||
| # ABC-units.R | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: R code for learning units | ||||
| # | ||||
| # Version: 4.0 | ||||
| # | ||||
| # Date:    2020  09  16 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| # V 4.0    2020 version | ||||
| # V 3.0    2019 version | ||||
| # V 2.0    2018 version | ||||
| # V 1.0    2017 version | ||||
| # V 0.1    First code | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # The R-scripts and datasets in this project will be continuously updated, | ||||
| # and updates will be posted on GitHub. To bring your version into the latest | ||||
| # state use the Git-pane (top left) and "pull" (blue downward arrow) from the | ||||
| # repository. However, this will overwrite locally edited versions of files. | ||||
|  | ||||
| # To edit code and experiment with it, for example to add your own comments and | ||||
| # examples, save your edited version into the "myScripts" folder. Otherwise you | ||||
| # may have problems with git when you update the project to a new version. It's | ||||
| # good practice to change the filename, for example by prepending your initials. | ||||
| # This helps distinguish the files you are working with e.g. in a list of | ||||
| # recent files. For example if your name is Honjo Tasuku, your edited | ||||
| # BIN-Sequence.R might be named HT-BIN-Sequence.R | ||||
|  | ||||
| # If you pull from github and get the following type of error ... | ||||
| #     --------------- | ||||
| #     error: Your local changes to the following files would be | ||||
| #     overwritten by merge | ||||
| #     ... | ||||
| #     Please commit your changes or stash them before you can merge. | ||||
| #     --------------- | ||||
| # ... then, you need to bring the offending file into its original state. | ||||
| # Open the Commit window, select the file, and click on the Revert button. | ||||
| # | ||||
| # When working with these scripts DO NOT SIMPLY  source()  THESE FILES! | ||||
|  | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| # Once you have typed and executed the function init(), you will find a file | ||||
| # called myScript.R in the project directory. | ||||
| # | ||||
| # Open it, you can place all of your code-experiments and notes into that | ||||
| # file. This will complement your "Course Journal". If you keep all contents in | ||||
| # this one file, you can find everything by using the <cmd>-F find function. To | ||||
| # cross-reference code in your journal, create section headings. | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| # The individual learning units' files can be opened by simply clicking on them | ||||
| # in the File pane. | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # ABC-units.R | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: R code for learning units | ||||
| # | ||||
| # Version: 4.0 | ||||
| # | ||||
| # Date:    2020  09  16 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| # V 4.0    2020 version | ||||
| # V 3.0    2019 version | ||||
| # V 2.0    2018 version | ||||
| # V 1.0    2017 version | ||||
| # V 0.1    First code | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # The R-scripts and datasets in this project will be continuously updated, | ||||
| # and updates will be posted on GitHub. To bring your version into the latest | ||||
| # state use the Git-pane (top left) and "pull" (blue downward arrow) from the | ||||
| # repository. However, this will overwrite locally edited versions of files. | ||||
|  | ||||
| # To edit code and experiment with it, for example to add your own comments and | ||||
| # examples, save your edited version into the "myScripts" folder. Otherwise you | ||||
| # may have problems with git when you update the project to a new version. It's | ||||
| # good practice to change the filename, for example by prepending your initials. | ||||
| # This helps distinguish the files you are working with e.g. in a list of | ||||
| # recent files. For example if your name is Honjo Tasuku, your edited | ||||
| # BIN-Sequence.R might be named HT-BIN-Sequence.R | ||||
|  | ||||
| # If you pull from github and get the following type of error ... | ||||
| #     --------------- | ||||
| #     error: Your local changes to the following files would be | ||||
| #     overwritten by merge | ||||
| #     ... | ||||
| #     Please commit your changes or stash them before you can merge. | ||||
| #     --------------- | ||||
| # ... then, you need to bring the offending file into its original state. | ||||
| # Open the Commit window, select the file, and click on the Revert button. | ||||
| # | ||||
| # When working with these scripts DO NOT SIMPLY  source()  THESE FILES! | ||||
|  | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| # Once you have typed and executed the function init(), you will find a file | ||||
| # called myScript.R in the project directory. | ||||
| # | ||||
| # Open it, you can place all of your code-experiments and notes into that | ||||
| # file. This will complement your "Course Journal". If you keep all contents in | ||||
| # this one file, you can find everything by using the <cmd>-F find function. To | ||||
| # cross-reference code in your journal, create section headings. | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| # The individual learning units' files can be opened by simply clicking on them | ||||
| # in the File pane. | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,16 +1,16 @@ | ||||
| Version: 1.0 | ||||
|  | ||||
| RestoreWorkspace: No | ||||
| SaveWorkspace: No | ||||
| AlwaysSaveHistory: No | ||||
|  | ||||
| EnableCodeIndexing: Yes | ||||
| UseSpacesForTab: Yes | ||||
| NumSpacesForTab: 2 | ||||
| Encoding: UTF-8 | ||||
|  | ||||
| RnwWeave: knitr | ||||
| LaTeX: XeLaTeX | ||||
|  | ||||
| AutoAppendNewline: Yes | ||||
| StripTrailingWhitespace: Yes | ||||
| Version: 1.0 | ||||
|  | ||||
| RestoreWorkspace: No | ||||
| SaveWorkspace: No | ||||
| AlwaysSaveHistory: No | ||||
|  | ||||
| EnableCodeIndexing: Yes | ||||
| UseSpacesForTab: Yes | ||||
| NumSpacesForTab: 2 | ||||
| Encoding: UTF-8 | ||||
|  | ||||
| RnwWeave: knitr | ||||
| LaTeX: XeLaTeX | ||||
|  | ||||
| AutoAppendNewline: Yes | ||||
| StripTrailingWhitespace: Yes | ||||
|   | ||||
							
								
								
									
										222
									
								
								BIN-ALI-BLAST.R
									
									
									
									
									
								
							
							
						
						
									
										222
									
								
								BIN-ALI-BLAST.R
									
									
									
									
									
								
							| @@ -1,111 +1,111 @@ | ||||
| # tocID <- "BIN-ALI-BLAST.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-ALI-BLAST unit. | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # Version:  1.3 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.3    2020 Maintenance | ||||
| #           1.2    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.1    Fixed parsing logic. | ||||
| #           1.0    First live version 2017. | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                               Line | ||||
| #TOC> --------------------------------------------------- | ||||
| #TOC>   1        Defining the APSES domain             45 | ||||
| #TOC>   2        Executing the BLAST search            75 | ||||
| #TOC>   3        Analysing results                     97 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Defining the APSES domain  =========================================== | ||||
|  | ||||
| # Load your protein database | ||||
| source("makeProteinDB.R") | ||||
|  | ||||
| # Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You | ||||
| # have entered this data into your database in the | ||||
| # BIN-ALI-Optimal_sequence_alignment unit.) | ||||
|  | ||||
| ( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct | ||||
|                                                 # name of the Mbp1 orthologue | ||||
|                                                 # in your protein | ||||
|                                                 # database, DON'T continue. We | ||||
|                                                 # need to fix this problem. | ||||
|                                                 # Get in touch. | ||||
|  | ||||
| (proID <- myDB$protein$ID[myDB$protein$name == myOrth]) | ||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||
|                                myDB$annotation$featureID == ftrID]) | ||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||
|                  start, | ||||
|                  end)) | ||||
|  | ||||
| # The MYSPE "apses" sequence is the sequence that we will use for our reverse | ||||
| # BLAST search. | ||||
|  | ||||
|  | ||||
| # =    2  Executing the BLAST search  ========================================== | ||||
|  | ||||
| # The ./scripts/BLAST.R code defines two functions to access the BLAST interface | ||||
| # through its Web API, and to parse results. Have a look at the script, then | ||||
| # source it: | ||||
|  | ||||
| source("./scripts/BLAST.R") | ||||
|  | ||||
| # Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces | ||||
| # cerevisiae: | ||||
|  | ||||
| BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence | ||||
|                      db = "refseq_protein",        # database to search in | ||||
|                      nHits = 10,                   # | ||||
|                      E = 0.01,                     # | ||||
|                      limits = "txid559292[ORGN]")  # S. cerevisiae S288c | ||||
|  | ||||
|  | ||||
| length(BLASTresults$hits)  # There should be at least one hit there. Ask for | ||||
|                            # advice in case this step fails. | ||||
|  | ||||
|  | ||||
| # =    3  Analysing results  =================================================== | ||||
|  | ||||
| (topHit <- BLASTresults$hits[[1]])   # Get the top hit | ||||
|  | ||||
| # What is the refseq ID of the top hit? | ||||
| topHit$accession | ||||
|  | ||||
| # If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses | ||||
| # domain. If it is not, ask me for advice. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-ALI-BLAST.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-ALI-BLAST unit. | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # Version:  1.3 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.3    2020 Maintenance | ||||
| #           1.2    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.1    Fixed parsing logic. | ||||
| #           1.0    First live version 2017. | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                               Line | ||||
| #TOC> --------------------------------------------------- | ||||
| #TOC>   1        Defining the APSES domain             45 | ||||
| #TOC>   2        Executing the BLAST search            75 | ||||
| #TOC>   3        Analysing results                     97 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Defining the APSES domain  =========================================== | ||||
|  | ||||
| # Load your protein database | ||||
| source("makeProteinDB.R") | ||||
|  | ||||
| # Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You | ||||
| # have entered this data into your database in the | ||||
| # BIN-ALI-Optimal_sequence_alignment unit.) | ||||
|  | ||||
| ( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct | ||||
|                                                 # name of the Mbp1 orthologue | ||||
|                                                 # in your protein | ||||
|                                                 # database, DON'T continue. We | ||||
|                                                 # need to fix this problem. | ||||
|                                                 # Get in touch. | ||||
|  | ||||
| (proID <- myDB$protein$ID[myDB$protein$name == myOrth]) | ||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||
|                                myDB$annotation$featureID == ftrID]) | ||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||
|                  start, | ||||
|                  end)) | ||||
|  | ||||
| # The MYSPE "apses" sequence is the sequence that we will use for our reverse | ||||
| # BLAST search. | ||||
|  | ||||
|  | ||||
| # =    2  Executing the BLAST search  ========================================== | ||||
|  | ||||
| # The ./scripts/BLAST.R code defines two functions to access the BLAST interface | ||||
| # through its Web API, and to parse results. Have a look at the script, then | ||||
| # source it: | ||||
|  | ||||
| source("./scripts/BLAST.R") | ||||
|  | ||||
| # Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces | ||||
| # cerevisiae: | ||||
|  | ||||
| BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence | ||||
|                      db = "refseq_protein",        # database to search in | ||||
|                      nHits = 10,                   # | ||||
|                      E = 0.01,                     # | ||||
|                      limits = "txid559292[ORGN]")  # S. cerevisiae S288c | ||||
|  | ||||
|  | ||||
| length(BLASTresults$hits)  # There should be at least one hit there. Ask for | ||||
|                            # advice in case this step fails. | ||||
|  | ||||
|  | ||||
| # =    3  Analysing results  =================================================== | ||||
|  | ||||
| (topHit <- BLASTresults$hits[[1]])   # Get the top hit | ||||
|  | ||||
| # What is the refseq ID of the top hit? | ||||
| topHit$accession | ||||
|  | ||||
| # If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses | ||||
| # domain. If it is not, ask me for advice. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,195 +1,195 @@ | ||||
| # tocID <- "BIN-ALI-Dotplot.R" | ||||
| # | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-ALI-Dotplot unit. | ||||
| # | ||||
| # Version:  0.2 | ||||
| # | ||||
| # Date:     2019  01  07 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           0.2    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                  Line | ||||
| #TOC> -------------------------------------- | ||||
| #TOC>   1        ___Section___            42 | ||||
| #TOC>   2        Tasks                   190 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  ___Section___  ======================================================= | ||||
|  | ||||
| if (!requireNamespace("BiocManager", quietly=TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (!requireNamespace("Biostrings", quietly=TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
| if (!requireNamespace("seqinr", quietly=TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
|  | ||||
|  | ||||
| # Let's load BLOSUM62 | ||||
| data(BLOSUM62, package = "Biostrings") | ||||
|  | ||||
| # Now let's craft code for a dotplot. That's surprisingly simple. We build a | ||||
| # matrix that has as many rows as one sequence, as many columns as another. Then | ||||
| # we go through every cell of the matrix and enter the pairscore we encounter | ||||
| # for the amino acid pair whose position corresponds to the row and column | ||||
| # index. Finally we visualize the matrix in a plot. | ||||
| # | ||||
|  | ||||
| # First we fetch our sequences and split them into single characters. | ||||
| sel <- myDB$protein$name == "MBP1_SACCE" | ||||
| MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel]) | ||||
|  | ||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||
| MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel]) | ||||
|  | ||||
| # Check that we have two character vectors of the expected length. | ||||
| str(MBP1_SACCE) | ||||
| str(MBP1_MYSPE) | ||||
|  | ||||
| # How do we get the pairscore values? Consider: a single pair of amino acids can | ||||
| # be obtained from sequence SACCE and MYSPE e.g. from position 13 and 21 ... | ||||
| MBP1_SACCE[13] | ||||
| MBP1_MYSPE[21] | ||||
|  | ||||
| # ... using these as subsetting expressions, we can pull the pairscore | ||||
| # from the MDM | ||||
| BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]] | ||||
|  | ||||
| # First we build an empty matrix that will hold all pairscores ... | ||||
| dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)), | ||||
|                  nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE)) | ||||
|  | ||||
| # ... then we loop over the sequences and store the scores in the matrix. | ||||
| # | ||||
| for (i in 1:length(MBP1_SACCE)) { | ||||
|   for (j in 1:length(MBP1_MYSPE)) { | ||||
|     dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]] | ||||
|   } | ||||
| } | ||||
|  | ||||
| # Even though this is a large matrix, this does not take much time ... | ||||
| # Let's have a look at a small block of the values: | ||||
|  | ||||
| dotMat[1:10, 1:10] | ||||
|  | ||||
| # Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in | ||||
| # the matrix correspond to an amino acid from MBP1_MYSPE. | ||||
|  | ||||
| # To plot this, we use the image() function. Here, with default parameters. | ||||
|  | ||||
| image(dotMat) | ||||
|  | ||||
| # Be patient, this takes a few moments to render: more than 500,000 values. | ||||
| # Nice. | ||||
| # What do you expect? | ||||
| # What would similar sequences look like? | ||||
| # What do you see? | ||||
|  | ||||
| # You might notice a thin line of yellow along the diagonal, moving | ||||
| # approximately from bottom left to top right, fading in and out of existence. | ||||
| # This is the signature of extended sequence similarity. | ||||
|  | ||||
| # Let's magnify this a bit by looking at only the first 200 amino acids ... | ||||
| image(dotMat[1:200, 1:200]) | ||||
|  | ||||
| # ... and, according to our normal writing convention, we would like the | ||||
| # diagonal to run from top-left to bottom-right since we write from left to | ||||
| # right and from top to bottom... | ||||
| image(dotMat[1:200, 1:200], ylim = 1.0:0.0) | ||||
|  | ||||
| # ... and we would like the range of the x- and y- axis to correspond to the | ||||
| # sequence position ... | ||||
| image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1)) | ||||
|  | ||||
| # ... and labels! Axis labels would be nice ... | ||||
| image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1), | ||||
|       xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" ) | ||||
|  | ||||
| # ... and why don't we have axis-numbers on all four sides? Go, make that right | ||||
| # too ... | ||||
| len <- 200 | ||||
| image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1), | ||||
|       xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE) | ||||
| box() | ||||
| axis(1, at = c(1, seq(10, len, by=10))) | ||||
| axis(2, at = c(1, seq(10, len, by=10))) | ||||
| axis(3, at = c(1, seq(10, len, by=10))) | ||||
| axis(4, at = c(1, seq(10, len, by=10))) | ||||
|  | ||||
| # ... you get the idea, we can infinitely customize our plot. However a good way | ||||
| # to do this is to develop a particular view for, say, a report or publication | ||||
| # in a script and then put it into a function. I have put a function into the | ||||
| # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because | ||||
| # there already is a dotplot function in the seqinr package: | ||||
|  | ||||
| seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr | ||||
| dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Ours | ||||
|  | ||||
| # Which one do you prefer? You can probably see the block patterns that arise | ||||
| # from segments of repetitive, low complexity sequence. But you probably have to | ||||
| # look very closely to discern the faint diagonals that correspond to similar | ||||
| # sequence. | ||||
|  | ||||
|  | ||||
| # Let's see if we can enhance the contrast between distributed noise and the | ||||
| # actual alignment of conserved residues. We can filter the dot matrix with a | ||||
| # pattern that enhances diagonally repeated values. Every value in the matrix | ||||
| # will be replaced by a weighted average of its neighborhood. Here is  a | ||||
| # diagonal-filter: | ||||
|  | ||||
| myFilter <- matrix(numeric(25), nrow = 5) | ||||
| myFilter[1, ] <- c( 1, 0, 0, 0, 0) | ||||
| myFilter[2, ] <- c( 0, 1, 0, 0, 0) | ||||
| myFilter[3, ] <- c( 0, 0, 1, 0, 0) | ||||
| myFilter[4, ] <- c( 0, 0, 0, 1, 0) | ||||
| myFilter[5, ] <- c( 0, 0, 0, 0, 1) | ||||
|  | ||||
| # I have added the option to read such filters (or others that you could define on your own) as a parameter of the function. | ||||
|  | ||||
| dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter) | ||||
|  | ||||
| # I think the result shows quite nicely how the two sequences are globally | ||||
| # related and where the regions of sequence similarity are. Play with this a bit | ||||
| # ...  Can you come up with a better filter? If so, eMail us. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    2  Tasks  =============================================================== | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-ALI-Dotplot.R" | ||||
| # | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-ALI-Dotplot unit. | ||||
| # | ||||
| # Version:  0.2 | ||||
| # | ||||
| # Date:     2019  01  07 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           0.2    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                  Line | ||||
| #TOC> -------------------------------------- | ||||
| #TOC>   1        ___Section___            42 | ||||
| #TOC>   2        Tasks                   190 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  ___Section___  ======================================================= | ||||
|  | ||||
| if (!requireNamespace("BiocManager", quietly=TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (!requireNamespace("Biostrings", quietly=TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
| if (!requireNamespace("seqinr", quietly=TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
|  | ||||
|  | ||||
| # Let's load BLOSUM62 | ||||
| data(BLOSUM62, package = "Biostrings") | ||||
|  | ||||
| # Now let's craft code for a dotplot. That's surprisingly simple. We build a | ||||
| # matrix that has as many rows as one sequence, as many columns as another. Then | ||||
| # we go through every cell of the matrix and enter the pairscore we encounter | ||||
| # for the amino acid pair whose position corresponds to the row and column | ||||
| # index. Finally we visualize the matrix in a plot. | ||||
| # | ||||
|  | ||||
| # First we fetch our sequences and split them into single characters. | ||||
| sel <- myDB$protein$name == "MBP1_SACCE" | ||||
| MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel]) | ||||
|  | ||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||
| MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel]) | ||||
|  | ||||
| # Check that we have two character vectors of the expected length. | ||||
| str(MBP1_SACCE) | ||||
| str(MBP1_MYSPE) | ||||
|  | ||||
| # How do we get the pairscore values? Consider: a single pair of amino acids can | ||||
| # be obtained from sequence SACCE and MYSPE e.g. from position 13 and 21 ... | ||||
| MBP1_SACCE[13] | ||||
| MBP1_MYSPE[21] | ||||
|  | ||||
| # ... using these as subsetting expressions, we can pull the pairscore | ||||
| # from the MDM | ||||
| BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]] | ||||
|  | ||||
| # First we build an empty matrix that will hold all pairscores: one row per | ||||
| # residue of MBP1_SACCE, one column per residue of MBP1_MYSPE ... | ||||
| dotMat <- matrix(0, nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE)) | ||||
|  | ||||
| # ... then we loop over the sequences and store the scores in the matrix. | ||||
| # seq_along() is used rather than 1:length(): for an empty sequence it yields | ||||
| # zero iterations, whereas 1:0 would iterate over c(1, 0). | ||||
| for (i in seq_along(MBP1_SACCE)) { | ||||
|   for (j in seq_along(MBP1_MYSPE)) { | ||||
|     dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]] | ||||
|   } | ||||
| } | ||||
|  | ||||
| # Even though this is a large matrix, this does not take much time ... | ||||
| # Let's have a look at a small block of the values: | ||||
|  | ||||
| dotMat[1:10, 1:10] | ||||
|  | ||||
| # Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in | ||||
| # the matrix correspond to an amino acid from MBP1_MYSPE. | ||||
|  | ||||
| # To plot this, we use the image() function. Here, with default parameters. | ||||
|  | ||||
| image(dotMat) | ||||
|  | ||||
| # Be patient, this takes a few moments to render: more than 500,000 values. | ||||
| # Nice. | ||||
| # What do you expect? | ||||
| # What would similar sequences look like? | ||||
| # What do you see? | ||||
|  | ||||
| # You might notice a thin line of yellow along the diagonal, moving | ||||
| # approximately from bottom left to top right, fading in and out of existence. | ||||
| # This is the signature of extended sequence similarity. | ||||
|  | ||||
| # Let's magnify this a bit by looking at only the first 200 amino acids ... | ||||
| image(dotMat[1:200, 1:200]) | ||||
|  | ||||
| # ... and, according to our normal writing convention, we would like the | ||||
| # diagonal to run from top-left to bottom-right since we write from left to | ||||
| # right and from top to bottom... | ||||
| image(dotMat[1:200, 1:200], ylim = 1.0:0.0) | ||||
|  | ||||
| # ... and we would like the range of the x- and y- axis to correspond to the | ||||
| # sequence position ... | ||||
| image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1)) | ||||
|  | ||||
| # ... and labels! Axis labels would be nice ... | ||||
| # Note: image() draws the rows of its matrix along the x-axis and the columns | ||||
| # along the y-axis. Rows of dotMat are MBP1_SACCE, columns are MBP1_MYSPE. | ||||
| image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1), | ||||
|       xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE" ) | ||||
|  | ||||
| # ... and why don't we have axis-numbers on all four sides? Go, make that right | ||||
| # too ... | ||||
| len <- 200 | ||||
| # image() draws matrix rows along x and columns along y, so the x-axis carries | ||||
| # MBP1_SACCE positions (rows of dotMat) and the y-axis MBP1_MYSPE (columns). | ||||
| # axes = FALSE suppresses the default axes so they can be drawn explicitly. | ||||
| image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1), | ||||
|       xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE", axes = FALSE) | ||||
| box() | ||||
| axis(1, at = c(1, seq(10, len, by=10))) | ||||
| axis(2, at = c(1, seq(10, len, by=10))) | ||||
| axis(3, at = c(1, seq(10, len, by=10))) | ||||
| axis(4, at = c(1, seq(10, len, by=10))) | ||||
|  | ||||
| # ... you get the idea, we can infinitely customize our plot. However a good way | ||||
| # to do this is to develop a particular view for, say, a report or publication | ||||
| # in a script and then put it into a function. I have put a function into the | ||||
| # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because | ||||
| # there already is a dotplot function in the seqinr package: | ||||
|  | ||||
| seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr | ||||
| dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Ours | ||||
|  | ||||
| # Which one do you prefer? You can probably see the block patterns that arise | ||||
| # from segments of repetitive, low complexity sequence. But you probably have to | ||||
| # look very closely to discern the faint diagonals that correspond to similar | ||||
| # sequence. | ||||
|  | ||||
|  | ||||
| # Let's see if we can enhance the contrast between distributed noise and the | ||||
| # actual alignment of conserved residues. We can filter the dot matrix with a | ||||
| # pattern that enhances diagonally repeated values. Every value in the matrix | ||||
| # will be replaced by a weighted average of its neighborhood. Here is  a | ||||
| # diagonal-filter: | ||||
|  | ||||
| myFilter <- matrix(numeric(25), nrow = 5) | ||||
| myFilter[1, ] <- c( 1, 0, 0, 0, 0) | ||||
| myFilter[2, ] <- c( 0, 1, 0, 0, 0) | ||||
| myFilter[3, ] <- c( 0, 0, 1, 0, 0) | ||||
| myFilter[4, ] <- c( 0, 0, 0, 1, 0) | ||||
| myFilter[5, ] <- c( 0, 0, 0, 0, 1) | ||||
|  | ||||
| # I have added the option to read such filters (or others that you could define | ||||
| # on your own) as a parameter of the function. | ||||
|  | ||||
| dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter) | ||||
|  | ||||
| # I think the result shows quite nicely how the two sequences are globally | ||||
| # related and where the regions of sequence similarity are. Play with this a bit | ||||
| # ...  Can you come up with a better filter? If so, eMail us. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    2  Tasks  =============================================================== | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										1256
									
								
								BIN-ALI-MSA.R
									
									
									
									
									
								
							
							
						
						
									
										1256
									
								
								BIN-ALI-MSA.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,365 +1,365 @@ | ||||
| # tocID <- "BIN-ALI-Optimal_sequence_alignment.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit. | ||||
| # | ||||
| # ============================================================================== | ||||
| # Version:  1.7.1 | ||||
| # | ||||
| # Date:     2017-09   -   2020-10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/ | ||||
| #           1.7    2020 updates | ||||
| #           1.6    Maintenance | ||||
| #           1.5    Change from require() to requireNamespace(), | ||||
| #                    use <package>::<function>() idiom throughout | ||||
| #           1.4    Pull s2c() from seqinr package, rather than loading the | ||||
| #                    entire library. | ||||
| #           1.3    Updated confirmation task with correct logic | ||||
| #           1.2    Added missing load of seqinr package | ||||
| #           1.1    Update annotation file logic - it could already have been | ||||
| #                    prepared in the BIN-FUNC-Annotation unit. | ||||
| #           1.0.1  bugfix | ||||
| #           1.0    First 2017 live version. | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                                      Line | ||||
| #TOC> -------------------------------------------------------------------------- | ||||
| #TOC>   1        Prepare                                                      58 | ||||
| #TOC>   2        Biostrings Pairwise Alignment                                75 | ||||
| #TOC>   2.1        Optimal global alignment                                   93 | ||||
| #TOC>   2.2        Optimal local alignment                                   156 | ||||
| #TOC>   3        APSES Domain annotation by alignment                        180 | ||||
| #TOC>   4        Update your database script                                 261 | ||||
| #TOC>   4.1        Preparing an annotation file ...                          267 | ||||
| #TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269 | ||||
| #TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314 | ||||
| #TOC>   4.2        Execute and Validate                                      338 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Prepare  ============================================================= | ||||
|  | ||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
| # You can get package information with the following commands: | ||||
| # library(help = seqinr)       # basic information | ||||
| # browseVignettes("seqinr")    # available vignettes | ||||
| # data(package = "seqinr")     # available datasets | ||||
|  | ||||
|  | ||||
| # You need to recreate the protein database that you have constructed in the | ||||
| # BIN-Storing_data unit. | ||||
|  | ||||
| source("./myScripts/makeProteinDB.R") | ||||
|  | ||||
|  | ||||
| # =    2  Biostrings Pairwise Alignment  ======================================= | ||||
|  | ||||
|  | ||||
| if (!requireNamespace("BiocManager", quietly=TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (!requireNamespace("Biostrings", quietly=TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # Biostrings stores sequences in "XString" objects. Once we have converted our | ||||
| # target sequences to AAString objects, the alignment itself is straightforward. | ||||
|  | ||||
| # ==   2.1  Optimal global alignment  ========================================== | ||||
|  | ||||
| # The pairwiseAlignment() function was written to behave | ||||
| # exactly like the functions you encountered on the EMBOSS server. | ||||
|  | ||||
| # First: make AAString objects ... | ||||
| sel <- myDB$protein$name == "MBP1_SACCE" | ||||
| aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel]) | ||||
|  | ||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||
| aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel]) | ||||
|  | ||||
| ?pairwiseAlignment | ||||
| # ... and align. | ||||
| # Global optimal alignment with end-gap penalties is default. | ||||
| ali1 <-  Biostrings::pairwiseAlignment( | ||||
|   aaMBP1_SACCE, | ||||
|   aaMBP1_MYSPE, | ||||
|   substitutionMatrix = "BLOSUM62", | ||||
|   gapOpening = 10, | ||||
|   gapExtension = 0.5) | ||||
|  | ||||
| str(ali1)  # ... it's complicated | ||||
|  | ||||
| # This is a Biostrings alignment object. But we can use Biostrings functions to | ||||
| # tame it: | ||||
| ali1 | ||||
| Biostrings::writePairwiseAlignments(ali1)   # That should look familiar | ||||
|  | ||||
| # And we can make the internal structure work for us  (@ is for classes as | ||||
| # $ is for lists ...) | ||||
| str(ali1@pattern) | ||||
| ali1@pattern | ||||
| ali1@pattern@range | ||||
| ali1@pattern@indel | ||||
| ali1@pattern@mismatch | ||||
|  | ||||
| # or work with "normal" R functions | ||||
| # the alignment length | ||||
| nchar(as.character(ali1@pattern)) | ||||
|  | ||||
| # the number of identities | ||||
| sum(seqinr::s2c(as.character(ali1@pattern)) == | ||||
|     seqinr::s2c(as.character(ali1@subject))) | ||||
|  | ||||
| # ... e.g. to calculate the percentage of identities | ||||
| 100 * | ||||
|   sum(seqinr::s2c(as.character(ali1@pattern)) == | ||||
|       seqinr::s2c(as.character(ali1@subject))) / | ||||
|   nchar(as.character(ali1@pattern)) | ||||
| # ... which should be the same as reported in the writePairwiseAlignments() | ||||
| # output. Awkward to type? Then it calls for a function: | ||||
| # | ||||
| percentID <- function(al) { | ||||
|   # Percent identity of a Biostrings pairwise alignment object "al": the | ||||
|   # number of alignment positions at which the aligned pattern and subject | ||||
|   # strings carry the same character, as a percentage of the aligned | ||||
|   # pattern length. | ||||
|   patChars <- seqinr::s2c(as.character(al@pattern)) | ||||
|   subChars <- seqinr::s2c(as.character(al@subject)) | ||||
|   nIdentical <- sum(patChars == subChars) | ||||
|   return(100 * nIdentical / nchar(as.character(al@pattern))) | ||||
| } | ||||
|  | ||||
| percentID(ali1) | ||||
|  | ||||
| # ==   2.2  Optimal local alignment  =========================================== | ||||
|  | ||||
| # Compare with local optimal alignment (like EMBOSS Water) | ||||
| ali2 <-  Biostrings::pairwiseAlignment( | ||||
|   aaMBP1_SACCE, | ||||
|   aaMBP1_MYSPE, | ||||
|   type = "local", | ||||
|   substitutionMatrix = "BLOSUM62", | ||||
|   gapOpening = 50, | ||||
|   gapExtension = 10) | ||||
|  | ||||
| Biostrings::writePairwiseAlignments(ali2) | ||||
| # This has probably only aligned the N-terminal DNA binding domain - but that | ||||
| # one has quite high sequence identity: | ||||
| percentID(ali2) | ||||
|  | ||||
| # == TASK: == | ||||
|  | ||||
| # Compare the two alignments. I have weighted the local alignment heavily | ||||
| # towards an ungapped alignment by setting very high gap penalties. Try changing | ||||
| # the gap penalties and see what happens: how does the number of indels change, | ||||
| # how does the length of indels change... | ||||
|  | ||||
|  | ||||
| # =    3  APSES Domain annotation by alignment  ================================ | ||||
|  | ||||
| # In this section we define the MYSPE APSES sequence by performing a global, | ||||
| # optimal sequence alignment of the yeast APSES domain with the full length | ||||
| # protein sequence of the protein that was the most similar to the yeast APSES | ||||
| # domain. | ||||
| # | ||||
|  | ||||
| # I have annotated the yeast APSES domain as a feature in the | ||||
| # database. To view the annotation, we can retrieve it via the proteinID and | ||||
| # featureID. Here is the yeast protein ID: | ||||
| (proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]) | ||||
|  | ||||
|  | ||||
| # ... and if you look at the feature table, you can identify the feature ID | ||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||
|  | ||||
| # ... and with the two annotations we can get the corresponding ID from the | ||||
| # annotation table | ||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||
|                              myDB$annotation$featureID == ftrID]) | ||||
|  | ||||
| # Show the complete annotation record that joins this protein and feature. | ||||
| # The selection must use the proteinID and featureID columns: the annotation's | ||||
| # own ID column holds neither the protein ID nor the feature ID. | ||||
| myDB$annotation[myDB$annotation$proteinID == proID & | ||||
|                 myDB$annotation$featureID == ftrID, ] | ||||
|  | ||||
| # The annotation record contains the start and end coordinates which we can use | ||||
| # to define the APSES domain sequence with a substr() expression. | ||||
|  | ||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||
|                  start, | ||||
|                  end)) | ||||
|  | ||||
| # Lots of code. But don't get lost. Let's recapitulate what we have done: we | ||||
| # have selected from the sequence column of the protein table the sequence whose | ||||
| # name is "MBP1_SACCE", and selected from the annotation table the start | ||||
| # and end coordinates of the annotation that joins an "APSES fold" feature with | ||||
| # the sequence, and used the start and end coordinates to extract a substring. | ||||
|  | ||||
| # Let's convert this to an AAstring and assign it: | ||||
| aaMB1_SACCE_APSES <- Biostrings::AAString(apses) | ||||
|  | ||||
| # Now let's align these two sequences of very different length without end-gap | ||||
| # penalties using the "overlap" type. "overlap" turns the | ||||
| # end-gap penalties off and that is crucially important since | ||||
| # the sequences have very different length. | ||||
|  | ||||
| aliApses <-  Biostrings::pairwiseAlignment( | ||||
|   aaMB1_SACCE_APSES, | ||||
|   aaMBP1_MYSPE, | ||||
|   type = "overlap", | ||||
|   substitutionMatrix = "BLOSUM62", | ||||
|   gapOpening = 10, | ||||
|   gapExtension = 0.5) | ||||
|  | ||||
| # Inspect the result. The aligned sequences should be clearly | ||||
| # homologous, and have (almost) no indels. The entire "pattern" | ||||
| # sequence from QIYSAR ... to ... KPLFDF  should be matched | ||||
| # with the "query". Is this correct? | ||||
| Biostrings::writePairwiseAlignments(aliApses) | ||||
|  | ||||
| # If this is correct, you can extract the matched sequence from | ||||
| # the alignment object. The syntax is a bit different from what | ||||
| # you have seen before: this is an "S4 object", not a list. No | ||||
| # worries: as.character() returns a normal string. | ||||
| as.character(aliApses@subject) | ||||
|  | ||||
| # Now, what are the aligned start and end coordinates? You can read them from | ||||
| # the output of writePairwiseAlignments(), or you can get them from the range of | ||||
| # the match. | ||||
|  | ||||
| str(aliApses@subject@range) | ||||
|  | ||||
| # start is: | ||||
| aliApses@subject@range@start | ||||
|  | ||||
| # ... and end is: | ||||
| aliApses@subject@range@start + aliApses@subject@range@width - 1 | ||||
|  | ||||
|  | ||||
| # =    4  Update your database script  ========================================= | ||||
|  | ||||
|  | ||||
| # Since we have this feature defined now, we can create a feature annotation | ||||
| # right away and store it in myDB. | ||||
|  | ||||
| # ==   4.1  Preparing an annotation file ...  ================================== | ||||
| # | ||||
| # ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit | ||||
| # | ||||
| # | ||||
| #   You DON'T already have a file called "<MYSPE>-Annotations.json" in the | ||||
| #   ./myScripts/ directory: | ||||
| # | ||||
| #   - Make a copy of the file "./data/refAnnotations.json" and put it in your | ||||
| #     myScripts/ directory. | ||||
| # | ||||
| #   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. | ||||
| #     if MYSPE is called "Cryptococcus neoformans", your file should be called | ||||
| #     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is | ||||
| #     "MBP1_CRYNE"). | ||||
| # | ||||
| #   - Open the file in the RStudio editor and delete all blocks for | ||||
| #     the Mbp1 protein annotations except the first one. | ||||
| # | ||||
| #   - From that block, delete all lines except for the line that says: | ||||
| # | ||||
| # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, | ||||
| # | ||||
| #   - Then delete the comma at the end of the line (your file will just have | ||||
| #     this one annotation). | ||||
| # | ||||
| #   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the | ||||
| #     "start" and "end" features to the coordinates you just discovered for the | ||||
| #     APSES domain in your sequence. | ||||
| # | ||||
| #   - Save the file in your myScripts/ directory | ||||
| # | ||||
| #   - Validate your file online at https://jsonlint.com/ | ||||
| # | ||||
| #   - Update your "./myScripts/makeProteinDB.R" script to load your new | ||||
| #     annotation when you recreate the database. Open the script in the | ||||
| #     RStudio editor, and add the following command at the end: | ||||
| # | ||||
| #     myDB <- dbAddAnnotation(myDB, | ||||
| #                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) | ||||
| #                                                 ^^^^^^^ | ||||
| #                                                edit this! | ||||
| #   - save and close the file. | ||||
| # | ||||
| # Then SKIP the next section. | ||||
| # | ||||
| # | ||||
| # ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit     | ||||
| # | ||||
| # | ||||
| #   You DO already have a file called "<MYSPE>-Annotations.json" in the | ||||
| #   ./myScripts/ directory: | ||||
| # | ||||
| #   - Open the file in the RStudio editor. | ||||
| # | ||||
| #   - Below the last feature lines (but before the closing "]") add the | ||||
| #     following feature line (without the "#") | ||||
| # | ||||
| # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"} | ||||
| # | ||||
| #   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the | ||||
| #     "start" and "end" features to the coordinates you just discovered for the | ||||
| #     APSES domain in your sequence. | ||||
| # | ||||
| #   - Add a comma after the preceding feature line. | ||||
| # | ||||
| #   - Save your file. | ||||
| # | ||||
| #   - Validate your file online at https://jsonlint.com/ | ||||
| # | ||||
| # | ||||
| # ==   4.2  Execute and Validate  ============================================== | ||||
| # | ||||
| #   - source() your database creation script: | ||||
| # | ||||
| #  source("./myScripts/makeProteinDB.R") | ||||
| # | ||||
| #     This should run without errors or warnings. If it doesn't work and you | ||||
| #     can't figure out quickly what's happening, ask on the mailing list for | ||||
| #     help. | ||||
| # | ||||
| #   - Confirm | ||||
| #     The following commands should retrieve the correct start and end | ||||
| #     coordinates and sequence of the MBP1_MYSPE APSES domain: | ||||
|  | ||||
| sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")) | ||||
|  | ||||
| (proID <- myDB$protein$ID[sel]) | ||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||
|                              myDB$annotation$featureID == ftrID]) | ||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||
|                  start, | ||||
|                  end)) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-ALI-Optimal_sequence_alignment.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit. | ||||
| # | ||||
| # ============================================================================== | ||||
| # Version:  1.7.1 | ||||
| # | ||||
| # Date:     2017-09   -   2020-10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/ | ||||
| #           1.7    2020 updates | ||||
| #           1.6    Maintenance | ||||
| #           1.5    Change from require() to requireNamespace(), | ||||
| #                    use <package>::<function>() idiom throughout | ||||
| #           1.4    Pull s2c() from seqinr package, rather than loading the | ||||
| #                    entire library. | ||||
| #           1.3    Updated confirmation task with correct logic | ||||
| #           1.2    Added missing load of seqinr package | ||||
| #           1.1    Update annotation file logic - it could already have been | ||||
| #                    prepared in the BIN-FUNC-Annotation unit. | ||||
| #           1.0.1  bugfix | ||||
| #           1.0    First 2017 live version. | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                                      Line | ||||
| #TOC> -------------------------------------------------------------------------- | ||||
| #TOC>   1        Prepare                                                      58 | ||||
| #TOC>   2        Biostrings Pairwise Alignment                                75 | ||||
| #TOC>   2.1        Optimal global alignment                                   93 | ||||
| #TOC>   2.2        Optimal local alignment                                   156 | ||||
| #TOC>   3        APSES Domain annotation by alignment                        180 | ||||
| #TOC>   4        Update your database script                                 261 | ||||
| #TOC>   4.1        Preparing an annotation file ...                          267 | ||||
| #TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269 | ||||
| #TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314 | ||||
| #TOC>   4.2        Execute and Validate                                      338 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Prepare  ============================================================= | ||||
|  | ||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
| # You can get package information with the following commands: | ||||
| # library(help = seqinr)       # basic information | ||||
| # browseVignettes("seqinr")    # available vignettes | ||||
| # data(package = "seqinr")     # available datasets | ||||
|  | ||||
|  | ||||
| # You need to recreate the protein database that you have constructed in the | ||||
| # BIN-Storing_data unit. | ||||
|  | ||||
| source("./myScripts/makeProteinDB.R") | ||||
|  | ||||
|  | ||||
| # =    2  Biostrings Pairwise Alignment  ======================================= | ||||
|  | ||||
|  | ||||
| if (!requireNamespace("BiocManager", quietly=TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (!requireNamespace("Biostrings", quietly=TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # Biostrings stores sequences in "XString" objects. Once we have converted our | ||||
| # target sequences to AAString objects, the alignment itself is straightforward. | ||||
|  | ||||
| # ==   2.1  Optimal global alignment  ========================================== | ||||
|  | ||||
| # The pairwiseAlignment() function was written to behave | ||||
| # exactly like the functions you encountered on the EMBOSS server. | ||||
|  | ||||
| # First: make AAString objects ... | ||||
| sel <- myDB$protein$name == "MBP1_SACCE" | ||||
| aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel]) | ||||
|  | ||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||
| aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel]) | ||||
|  | ||||
| ?pairwiseAlignment | ||||
| # ... and align. | ||||
| # Global optimal alignment with end-gap penalties is default. | ||||
| ali1 <-  Biostrings::pairwiseAlignment( | ||||
|   aaMBP1_SACCE, | ||||
|   aaMBP1_MYSPE, | ||||
|   substitutionMatrix = "BLOSUM62", | ||||
|   gapOpening = 10, | ||||
|   gapExtension = 0.5) | ||||
|  | ||||
| str(ali1)  # ... it's complicated | ||||
|  | ||||
| # This is a Biostrings alignment object. But we can use Biostrings functions to | ||||
| # tame it: | ||||
| ali1 | ||||
| Biostrings::writePairwiseAlignments(ali1)   # That should look familiar | ||||
|  | ||||
| # And we can make the internal structure work for us  (@ is for classes as | ||||
| # $ is for lists ...) | ||||
| str(ali1@pattern) | ||||
| ali1@pattern | ||||
| ali1@pattern@range | ||||
| ali1@pattern@indel | ||||
| ali1@pattern@mismatch | ||||
|  | ||||
| # or work with "normal" R functions | ||||
| # the alignment length | ||||
| nchar(as.character(ali1@pattern)) | ||||
|  | ||||
| # the number of identities | ||||
| sum(seqinr::s2c(as.character(ali1@pattern)) == | ||||
|     seqinr::s2c(as.character(ali1@subject))) | ||||
|  | ||||
| # ... e.g. to calculate the percentage of identities | ||||
| 100 * | ||||
|   sum(seqinr::s2c(as.character(ali1@pattern)) == | ||||
|       seqinr::s2c(as.character(ali1@subject))) / | ||||
|   nchar(as.character(ali1@pattern)) | ||||
| # ... which should be the same as reported in the writePairwiseAlignments() | ||||
| # output. Awkward to type? Then it calls for a function: | ||||
| # | ||||
| percentID <- function(al) { | ||||
|   # Percent identity of a Biostrings pairwise alignment object "al": the | ||||
|   # number of alignment positions at which the aligned pattern and subject | ||||
|   # strings carry the same character, as a percentage of the aligned | ||||
|   # pattern length. | ||||
|   patChars <- seqinr::s2c(as.character(al@pattern)) | ||||
|   subChars <- seqinr::s2c(as.character(al@subject)) | ||||
|   nIdentical <- sum(patChars == subChars) | ||||
|   return(100 * nIdentical / nchar(as.character(al@pattern))) | ||||
| } | ||||
|  | ||||
| percentID(ali1) | ||||
|  | ||||
| # ==   2.2  Optimal local alignment  =========================================== | ||||
|  | ||||
| # Compare with local optimal alignment (like EMBOSS Water) | ||||
| ali2 <-  Biostrings::pairwiseAlignment( | ||||
|   aaMBP1_SACCE, | ||||
|   aaMBP1_MYSPE, | ||||
|   type = "local", | ||||
|   substitutionMatrix = "BLOSUM62", | ||||
|   gapOpening = 50, | ||||
|   gapExtension = 10) | ||||
|  | ||||
| Biostrings::writePairwiseAlignments(ali2) | ||||
| # This has probably only aligned the N-terminal DNA binding domain - but that | ||||
| # one has quite high sequence identity: | ||||
| percentID(ali2) | ||||
|  | ||||
| # == TASK: == | ||||
|  | ||||
| # Compare the two alignments. I have weighted the local alignment heavily | ||||
| # towards an ungapped alignment by setting very high gap penalties. Try changing | ||||
| # the gap penalties and see what happens: how does the number of indels change, | ||||
| # how does the length of indels change... | ||||
|  | ||||
|  | ||||
| # =    3  APSES Domain annotation by alignment  ================================ | ||||
|  | ||||
| # In this section we define the MYSPE APSES sequence by performing a global, | ||||
| # optimal sequence alignment of the yeast APSES domain with the full length | ||||
| # protein sequence of the protein that was the most similar to the yeast APSES | ||||
| # domain. | ||||
| # | ||||
|  | ||||
| # I have annotated the yeast APSES domain as a feature in the | ||||
| # database. To view the annotation, we can retrieve it via the proteinID and | ||||
| # featureID. Here is the yeast protein ID: | ||||
| (proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]) | ||||
|  | ||||
|  | ||||
| # ... and if you look at the feature table, you can identify the feature ID | ||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||
|  | ||||
| # ... and with the two annotations we can get the corresponding ID from the | ||||
| # annotation table | ||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||
|                              myDB$annotation$featureID == ftrID]) | ||||
|  | ||||
| # Show the complete annotation record that joins this protein and feature. | ||||
| # The selection must use the proteinID and featureID columns: the annotation's | ||||
| # own ID column holds neither the protein ID nor the feature ID. | ||||
| myDB$annotation[myDB$annotation$proteinID == proID & | ||||
|                 myDB$annotation$featureID == ftrID, ] | ||||
|  | ||||
| # The annotation record contains the start and end coordinates which we can use | ||||
| # to define the APSES domain sequence with a substr() expression. | ||||
|  | ||||
| # Read the start and end coordinates from the annotation row whose ID is | ||||
| # fanID, then use them to cut the corresponding substring out of the sequence | ||||
| # of the protein whose ID is proID. The enclosing parentheses print each value | ||||
| # as it is assigned. | ||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||
|                  start, | ||||
|                  end)) | ||||
|  | ||||
| # Lots of code. But don't get lost. Let's recapitulate what we have done: we | ||||
| # have selected from the sequence column of the protein table the sequence whose | ||||
| # name is "MBP1_SACCE", and selected from the annotation table the start | ||||
| # and end coordinates of the annotation that joins an "APSES fold" feature with | ||||
| # the sequence, and used the start and end coordinates to extract a substring. | ||||
|  | ||||
| # Let's convert this to an AAstring and assign it: | ||||
| aaMB1_SACCE_APSES <- Biostrings::AAString(apses) | ||||
|  | ||||
| # Now let's align these two sequences of very different length without end-gap | ||||
| # penalties using the "overlap" type. "overlap" turns the end-gap penalties | ||||
| # off, and that is crucially important since the sequences have very | ||||
| # different length. The yeast APSES domain is passed first (the "pattern"), | ||||
| # aaMBP1_MYSPE second (the "subject"). | ||||
|  | ||||
| aliApses <-  Biostrings::pairwiseAlignment( | ||||
|   aaMB1_SACCE_APSES, | ||||
|   aaMBP1_MYSPE, | ||||
|   type = "overlap", | ||||
|   substitutionMatrix = "BLOSUM62", | ||||
|   gapOpening = 10, | ||||
|   gapExtension = 0.5) | ||||
|  | ||||
| # Inspect the result. The aligned sequences should be clearly | ||||
| # homologous, and have (almost) no indels. The entire "pattern" | ||||
| # sequence from QIYSAR ... to ... KPLFDF  should be matched | ||||
| # with the "subject". Is this correct? | ||||
| Biostrings::writePairwiseAlignments(aliApses) | ||||
|  | ||||
| # If this is correct, you can extract the matched sequence from | ||||
| # the alignment object. The syntax is a bit different from what | ||||
| # you have seen before: this is an "S4 object", not a list. No | ||||
| # worries: as.character() returns a normal string. | ||||
| as.character(aliApses@subject) | ||||
|  | ||||
| # Now, what are the aligned start and end coordinates? You can read them from | ||||
| # the output of writePairwiseAlignments(), or you can get them from the range of | ||||
| # the match. | ||||
|  | ||||
| str(aliApses@subject@range) | ||||
|  | ||||
| # start is: | ||||
| aliApses@subject@range@start | ||||
|  | ||||
| # ... and end is: | ||||
| aliApses@subject@range@start + aliApses@subject@range@width - 1 | ||||
|  | ||||
|  | ||||
| # =    4  Update your database script  ========================================= | ||||
|  | ||||
|  | ||||
| # Since we have this feature defined now, we can create a feature annotation | ||||
| # right away and store it in myDB. | ||||
|  | ||||
| # ==   4.1  Preparing an annotation file ...  ================================== | ||||
| # | ||||
| # ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit | ||||
| # | ||||
| # | ||||
| #   You DON'T already have a file called "<MYSPE>-Annotations.json" in the | ||||
| #   ./myScripts/ directory: | ||||
| # | ||||
| #   - Make a copy of the file "./data/refAnnotations.json" and put it in your | ||||
| #     myScripts/ directory. | ||||
| # | ||||
| #   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. | ||||
| #     if MYSPE is called "Cryptococcus neoformans", your file should be called | ||||
| #     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is | ||||
| #     "MBP1_CRYNE"). | ||||
| # | ||||
| #   - Open the file in the RStudio editor and delete all blocks for | ||||
| #     the Mbp1 protein annotations except the first one. | ||||
| # | ||||
| #   - From that block, delete all lines except for the line that says: | ||||
| # | ||||
| # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, | ||||
| # | ||||
| #   - Then delete the comma at the end of the line (your file will just have | ||||
| #     this one annotation). | ||||
| # | ||||
| #   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the | ||||
| #     "start" and "end" features to the coordinates you just discovered for the | ||||
| #     APSES domain in your sequence. | ||||
| # | ||||
| #   - Save the file in your myScripts/ directory | ||||
| # | ||||
| #   - Validate your file online at https://jsonlint.com/ | ||||
| # | ||||
| #   - Update your "./myScripts/makeProteinDB.R" script to load your new | ||||
| #     annotation when you recreate the database. Open the script in the | ||||
| #     RStudio editor, and add the following command at the end: | ||||
| # | ||||
| #     myDB <- dbAddAnnotation(myDB, | ||||
| #                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) | ||||
| #                                                 ^^^^^^^ | ||||
| #                                                edit this! | ||||
| #   - save and close the file. | ||||
| # | ||||
| # Then SKIP the next section. | ||||
| # | ||||
| # | ||||
| # ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit     | ||||
| # | ||||
| # | ||||
| #   You DO already have a file called "<MYSPE>-Annotations.json" in the | ||||
| #   ./myScripts/ directory: | ||||
| # | ||||
| #   - Open the file in the RStudio editor. | ||||
| # | ||||
| #   - Below the last feature lines (but before the closing "]") add the | ||||
| #     following feature line (without the "#") | ||||
| # | ||||
| # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"} | ||||
| # | ||||
| #   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the | ||||
| #     "start" and "end" features to the coordinates you just discovered for the | ||||
| #     APSES domain in your sequence. | ||||
| # | ||||
| #   - Add a comma after the preceding feature line. | ||||
| # | ||||
| #   - Save your file. | ||||
| # | ||||
| #   - Validate your file online at https://jsonlint.com/ | ||||
| # | ||||
| # | ||||
| # ==   4.2  Execute and Validate  ============================================== | ||||
| # | ||||
| #   - source() your database creation script: | ||||
| # | ||||
| #  source("./myScripts/makeProteinDB.R") | ||||
| # | ||||
| #     This should run without errors or warnings. If it doesn't work and you | ||||
| #     can't figure out quickly what's happening, ask on the mailing list for | ||||
| #     help. | ||||
| # | ||||
| #   - Confirm | ||||
| #     The following commands should retrieve the correct start and end | ||||
| #     coordinates and sequence of the MBP1_MYSPE APSES domain: | ||||
|  | ||||
| # Locate the MYSPE Mbp1 row by its name ("MBP1_" followed by the species | ||||
| # code), then follow its ID through the feature and annotation tables to the | ||||
| # APSES domain coordinates and sequence. | ||||
| sel <- which(myDB$protein$name == paste0("MBP1_", biCode(MYSPE))) | ||||
|  | ||||
| (proID <- myDB$protein$ID[sel]) | ||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||
|                              myDB$annotation$featureID == ftrID]) | ||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||
|                  start, | ||||
|                  end)) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,313 +1,313 @@ | ||||
| # tocID <- "BIN-ALI-Similarity.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-ALI-Similarity unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Updates | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0    Refactored for 2017; add aaindex, ternary plot. | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #   Update ggtern:: ternary plot to use aacol dots under text | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                          Line | ||||
| #TOC> ---------------------------------------------- | ||||
| #TOC>   1        Amino Acid Properties            43 | ||||
| #TOC>   2        Mutation Data matrix            189 | ||||
| #TOC>   3        Background score                230 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Amino Acid Properties  =============================================== | ||||
|  | ||||
| # A large collection of amino acid property tables is available via the seqinr | ||||
| # package: | ||||
|  | ||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = seqinr)       # basic information | ||||
| #  browseVignettes("seqinr")    # available vignettes | ||||
| #  data(package = "seqinr")     # available datasets | ||||
|  | ||||
| # A true Labor of Love has gone into the compilation of the seqinr "aaindex" | ||||
| #  data: | ||||
|  | ||||
| ?aaindex | ||||
| data(aaindex, package = "seqinr")  # load the aaindex list from the package | ||||
|  | ||||
| length(aaindex) | ||||
|  | ||||
| # Here are all the index descriptions, printed as one | ||||
| # "<number>: <description>" line per entry. seq_along() is used so that an | ||||
| # empty list prints nothing (1:length() would count 1, 0 and fail). | ||||
| for (i in seq_along(aaindex)) { | ||||
|   cat(paste0(i, ": ", aaindex[[i]]$D, "\n")) | ||||
| } | ||||
|  | ||||
| # It's a bit cumbersome to search through the descriptions ... here is a | ||||
| # function to make this easier: | ||||
|  | ||||
| searchAAindex <- function(patt) { | ||||
|   # Searches the descriptions ($D) of the global "aaindex" list for the | ||||
|   # regular expression "patt" and prints the index number and description | ||||
|   # of every matching entry, one per line. | ||||
|   # Parameters: | ||||
|   #    patt   chr   regular expression | ||||
|   # Value:    NULL (invisible); the function is called for its printed output. | ||||
|  | ||||
|   # vapply() always returns a logical vector, even if aaindex is empty; | ||||
|   # sapply() would return an empty list in that case, which which() rejects. | ||||
|   isHit <- vapply(aaindex, | ||||
|                   function(x) { any(grepl(patt, x$D)) }, | ||||
|                   logical(1)) | ||||
|   hits <- which(isHit) | ||||
|   for (i in seq_along(hits)) { | ||||
|     cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D)) | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| searchAAindex("free energy")          # Search for "free energy" | ||||
| searchAAindex("(size)|(volume)")      # Search for "size" or "volume": | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # Let's examine ... | ||||
| # ... a hydrophobicity index | ||||
| (Y <- aaindex[[528]][c("D", "I")]) | ||||
|  | ||||
| # ... a volume index | ||||
| (V <- aaindex[[150]][c("D", "I")]) | ||||
|  | ||||
| # ... and one of our own: side-chain pK values as reported by | ||||
| # Pace et al. (2009) JBC 284:13285-13289. Residues without an ionizable | ||||
| # side chain are assigned 7.4 (physiological pH). The values are stored as a | ||||
| # named numeric vector in element "I", so that K$I can be used like Y$I | ||||
| # and V$I. | ||||
| K <- list(I = c(Ala =  7.4, | ||||
|                 Arg = 12.3, | ||||
|                 Asn =  7.4, | ||||
|                 Asp =  3.9, | ||||
|                 Cys =  8.6, | ||||
|                 Gln =  7.4, | ||||
|                 Glu =  4.3, | ||||
|                 Gly =  7.4, | ||||
|                 His =  6.5, | ||||
|                 Ile =  7.4, | ||||
|                 Leu =  7.4, | ||||
|                 Lys = 10.4, | ||||
|                 Met =  7.4, | ||||
|                 Phe =  7.4, | ||||
|                 Pro =  7.4, | ||||
|                 Ser =  7.4, | ||||
|                 Thr =  7.4, | ||||
|                 Trp =  7.4, | ||||
|                 Tyr =  9.8, | ||||
|                 Val =  7.4)) | ||||
|  | ||||
|  | ||||
| # Given these biophysical indices, how similar are the amino acids? We have | ||||
| # three dimensions of measures here. Scatterplots can only display two | ||||
| # dimensions ... | ||||
|  | ||||
| # pull the names from Y$I, convert them to single letter code, and reorder the | ||||
| # AACOLS palette accordingly ... | ||||
| aac <- AACOLS[toupper(seqinr::a(names(Y$I)))] | ||||
|  | ||||
| plot(Y$I, V$I, | ||||
|      xlab = "hydrophobicity", ylab = "volume", | ||||
|      pch = 21, | ||||
|      cex = 6, | ||||
|      col = aac, | ||||
|      bg  = aac) | ||||
| text(Y$I, V$I, names(Y$I), cex = 0.8) | ||||
|  | ||||
| plot(Y$I, K$I, | ||||
|      xlab = "hydrophobicity", ylab = "pK", | ||||
|      pch = 21, | ||||
|      cex = 6, | ||||
|      col = aac, | ||||
|      bg  = aac) | ||||
| text(Y$I, K$I, names(Y$I), cex = 0.8) | ||||
|  | ||||
| # ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such | ||||
| # plots are in general unintuitive and hard to interpret. One alternative is a | ||||
| # so-called "ternary plot": | ||||
|  | ||||
| if (! requireNamespace("ggtern", quietly=TRUE)) { | ||||
|   install.packages("ggtern") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = ggtern)       # basic information | ||||
| #  browseVignettes("ggtern")    # available vignettes | ||||
| #  data(package = "ggtern")     # available datasets | ||||
|  | ||||
|  | ||||
|  | ||||
| # Rescale each of the three indices linearly so that its minimum maps to 0.05 | ||||
| # and its maximum to 0.95, and collect them as the columns phi, vol and pK of | ||||
| # a data frame with one row per amino acid. | ||||
| myDat <- as.data.frame(lapply(list(phi = Y$I, vol = V$I, pK = K$I), | ||||
|                               function(x) { | ||||
|                                 0.9 * ((x - min(x)) / (max(x) - min(x))) + 0.05 | ||||
|                               }), | ||||
|                        stringsAsFactors = FALSE) | ||||
| rownames(myDat) <- names(Y$I) | ||||
|  | ||||
| ggtern::ggtern(data = myDat, | ||||
|                ggplot2::aes(x = vol, | ||||
|                    y = phi, | ||||
|                    z = pK, | ||||
|                    label = rownames(myDat))) + ggplot2::geom_text() | ||||
|  | ||||
| # This results in a mapping of amino acids relative to each other that is | ||||
| # similar to the Venn diagram you have seen in the notes. | ||||
|  | ||||
| # ... or we could use principal components analysis, to pull out the | ||||
| # best projection of the three feature dimensions into two. (Done here without delving | ||||
| # into the theory ...) | ||||
| # prcomp() computes the principal components of the three scaled features; | ||||
| # the first two columns of prc$x are plotted against each other. The points | ||||
| # are coloured with the aac palette that was reordered to match names(Y$I). | ||||
| prc <- prcomp(myDat) | ||||
| plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n", | ||||
|      pch=19, cex=6, col=aac, cex.main=0.7, | ||||
|      main="Principal Component Analysis of Amino Acid Features") | ||||
| text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088") | ||||
|  | ||||
| # This matches the intuition rather well in that "similar" amino acids are close | ||||
| # on the plot. But we can't interpret the distances in terms of just one of the | ||||
| # parameters. Whatever - nature has a different way to define similarity: | ||||
| # mutations to similar amino acids are less likely to break the protein. | ||||
|  | ||||
|  | ||||
| # =    2  Mutation Data matrix  ================================================ | ||||
|  | ||||
| # A mutation data matrix encodes all amino acid pairscores in a matrix. | ||||
|  | ||||
| # The Biostrings package contains the most common mutation data matrices. | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly=TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly=TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help=Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")  # available vignettes | ||||
| #  data(package = "Biostrings")   # available datasets | ||||
|  | ||||
| # Let's load the BLOSUM62 mutation data matrix from the package | ||||
| data(BLOSUM62, package = "Biostrings") | ||||
|  | ||||
| # ... and see what it contains. (You've seen this matrix before.) | ||||
| BLOSUM62 | ||||
|  | ||||
| # We can simply access values via the row/column names. | ||||
| # Identical amino acids have high scores ... | ||||
| BLOSUM62["H", "H"]   # Score for a pair of two histidines | ||||
| BLOSUM62["S", "S"]   # Score for a pair of two serines | ||||
|  | ||||
| # Similar amino acids have low positive scores ... | ||||
| BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair | ||||
| BLOSUM62["F", "Y"]   # Score for a phenylalanine / tyrosine pair | ||||
|  | ||||
| # Dissimilar amino acids have negative scores ... | ||||
| BLOSUM62["L", "K"]   # Score for a leucine / lysine pair | ||||
| BLOSUM62["Q", "P"]   # Score for a glutamine / proline pair | ||||
|  | ||||
|  | ||||
| BLOSUM62["R", "W"]   # the matrix is symmetric! | ||||
| BLOSUM62["W", "R"] | ||||
|  | ||||
|  | ||||
| # =    3  Background score  ==================================================== | ||||
|  | ||||
| # The mutation data matrix is designed to give high scores to homologous | ||||
| # sequences, low scores to non-homologous sequences. What score on average | ||||
| # should we expect for a random sequence? | ||||
|  | ||||
| # If we sample amino acid pairs at random, we will get a score that is the | ||||
| # average of the individual pairscores in the matrix. Omitting the ambiguity | ||||
| # codes and the gap character: | ||||
|  | ||||
| sum(BLOSUM62[1:20, 1:20])/400 | ||||
|  | ||||
| # But that score could be higher for real sequences, for which the amino acid | ||||
| # distribution is not random. For example membrane proteins have a large number | ||||
| # of hydrophobic residues - an alignment of unrelated proteins might produce | ||||
| # positive scores. And there are other proteins with biased amino acid | ||||
| # compositions, in particular proteins that interact with multiple other | ||||
| # proteins. Let's test how this impacts the background score by comparing a | ||||
| # sequence with shuffled sequences. These have the same composition, but are | ||||
| # obviously not homologous. The data directory contains the FASTA file for the | ||||
| # PDB ID 3FG7 - a villin headpiece structure with a large amount of | ||||
| # low-complexity amino acid sequence ... | ||||
|  | ||||
| aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]] | ||||
|  | ||||
| # ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C) | ||||
| # with an exceptionally high percentage of hydrophobic residues. | ||||
|  | ||||
| aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]] | ||||
|  | ||||
| # Here is a function that takes two sequences and | ||||
| # returns their average pairscore. | ||||
|  | ||||
| averagePairScore <- function(a, b, MDM = BLOSUM62) { | ||||
|   # Returns the average pairscore of two equal-length sequences: the | ||||
|   # residues at position i of a and b are scored against each other with | ||||
|   # MDM, and the scores are averaged over all positions. | ||||
|   # Parameters: | ||||
|   #    a, b   chr   amino acid sequence strings of the same length | ||||
|   #    MDM          mutation data matrix whose row and column names are the | ||||
|   #                 characters found in a and b. Default is BLOSUM62 | ||||
|   # Value:    num   average pairscore (NaN for empty sequences). | ||||
|   a <- unlist(strsplit(a, "")) | ||||
|   b <- unlist(strsplit(b, "")) | ||||
|   if (length(a) != length(b)) { | ||||
|     # Positions can only be paired one-to-one; a longer b must not be | ||||
|     # silently truncated. | ||||
|     stop("Sequences a and b must have the same length.") | ||||
|   } | ||||
|   # A two-column character matrix indexes MDM by (row name, column name) | ||||
|   # pairs, which looks up the pairscores of all positions at once. | ||||
|   return(sum(MDM[cbind(a, b)]) / length(a)) | ||||
| } | ||||
|  | ||||
| # Score each original sequence against N random shuffles of itself. The | ||||
| # sequences are passed through toString() before scoring because | ||||
| # averagePairScore() expects character strings. The result vectors are | ||||
| # preallocated. No random seed is set here, so the scores differ from run to | ||||
| # run; the two sample() calls alternate within each iteration. | ||||
| orig3FG7 <- toString(aa3FG7) | ||||
| orig2F1C <- toString(aa2F1C) | ||||
| N <- 1000 | ||||
| scores3FG7 <- numeric(N) | ||||
| scores2F1C <- numeric(N) | ||||
| for (i in 1:N) { | ||||
|   scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7))) | ||||
|   scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C))) | ||||
| } | ||||
|  | ||||
| # Plot the distributions: both histograms are drawn with translucent colours | ||||
| # on the same axes (add = TRUE) and with identical breaks, so that they can be | ||||
| # compared directly. The vertical line marks the average over the first 20 | ||||
| # rows and columns of BLOSUM62. | ||||
| # NOTE(review): the fixed breaks cover only -1.5 to 0; presumably all shuffled | ||||
| # scores fall into that interval - confirm, since hist() needs the breaks to | ||||
| # span the data. | ||||
| hist(scores3FG7, | ||||
|      col="#5599EE33", | ||||
|      breaks = seq(-1.5, 0, by=0.1), | ||||
|      main = "Pairscores for randomly shuffled sequences", | ||||
|      xlab = "Average pairscore from BLOSUM 62") | ||||
| hist(scores2F1C, | ||||
|      col="#55EE9933", | ||||
|      breaks = seq(-1.5, 0, by=0.1), | ||||
|      add = TRUE) | ||||
| abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2) | ||||
| legend('topright', | ||||
|        c("3FG7 (villin)", "2F1C (OmpG)"), | ||||
|        fill = c("#5599EE33", "#55EE9933"), bty = 'n', | ||||
|        inset = 0.1) | ||||
|  | ||||
| # This is an important result: even though we have shuffled significantly biased | ||||
| # sequences, and the average scores trend above the average of the mutation data | ||||
| # matrix, the average scores still remain comfortably below zero. This means | ||||
| # that we can't (in general) improve a high-scoring alignment by simply | ||||
| # extending it with randomly matched residues. We will only improve the score if | ||||
| # the similarity of newly added residues is larger than what we expect to get by | ||||
| # random chance! | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-ALI-Similarity.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-ALI-Similarity unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Updates | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0    Refactored for 2017; add aaindex, ternary plot. | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #   Update ggtern:: ternary plot to use aacol dots under text | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                          Line | ||||
| #TOC> ---------------------------------------------- | ||||
| #TOC>   1        Amino Acid Properties            43 | ||||
| #TOC>   2        Mutation Data matrix            189 | ||||
| #TOC>   3        Background score                230 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Amino Acid Properties  =============================================== | ||||
|  | ||||
| # A large collection of amino acid property tables is available via the seqinr | ||||
| # package: | ||||
|  | ||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = seqinr)       # basic information | ||||
| #  browseVignettes("seqinr")    # available vignettes | ||||
| #  data(package = "seqinr")     # available datasets | ||||
|  | ||||
| # A true Labor of Love has gone into the compilation of the seqinr "aaindex" | ||||
| #  data: | ||||
|  | ||||
| ?aaindex | ||||
| data(aaindex, package = "seqinr")  # load the aaindex list from the package | ||||
|  | ||||
| length(aaindex) | ||||
|  | ||||
| # Here are all the index descriptions, printed as one | ||||
| # "<number>: <description>" line per entry. seq_along() is used so that an | ||||
| # empty list prints nothing (1:length() would count 1, 0 and fail). | ||||
| for (i in seq_along(aaindex)) { | ||||
|   cat(paste0(i, ": ", aaindex[[i]]$D, "\n")) | ||||
| } | ||||
|  | ||||
| # It's a bit cumbersome to search through the descriptions ... here is a | ||||
| # function to make this easier: | ||||
|  | ||||
| searchAAindex <- function(patt) { | ||||
|   # Searches the descriptions ($D) of the global "aaindex" list for the | ||||
|   # regular expression "patt" and prints the index number and description | ||||
|   # of every matching entry, one per line. | ||||
|   # Parameters: | ||||
|   #    patt   chr   regular expression | ||||
|   # Value:    NULL (invisible); the function is called for its printed output. | ||||
|  | ||||
|   # vapply() always returns a logical vector, even if aaindex is empty; | ||||
|   # sapply() would return an empty list in that case, which which() rejects. | ||||
|   isHit <- vapply(aaindex, | ||||
|                   function(x) { any(grepl(patt, x$D)) }, | ||||
|                   logical(1)) | ||||
|   hits <- which(isHit) | ||||
|   for (i in seq_along(hits)) { | ||||
|     cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D)) | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| searchAAindex("free energy")          # Search for "free energy" | ||||
| searchAAindex("(size)|(volume)")      # Search for "size" or "volume": | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # Let's examine ... | ||||
| # ... a hydrophobicity index | ||||
| (Y <- aaindex[[528]][c("D", "I")]) | ||||
|  | ||||
| # ... a volume index | ||||
| (V <- aaindex[[150]][c("D", "I")]) | ||||
|  | ||||
| # ... and one of our own: side-chain pK values as reported by | ||||
| # Pace et al. (2009) JBC 284:13285-13289. Residues without an ionizable | ||||
| # side chain are assigned 7.4 (physiological pH). The values are stored as a | ||||
| # named numeric vector in element "I", so that K$I can be used like Y$I | ||||
| # and V$I. | ||||
| K <- list(I = c(Ala =  7.4, | ||||
|                 Arg = 12.3, | ||||
|                 Asn =  7.4, | ||||
|                 Asp =  3.9, | ||||
|                 Cys =  8.6, | ||||
|                 Gln =  7.4, | ||||
|                 Glu =  4.3, | ||||
|                 Gly =  7.4, | ||||
|                 His =  6.5, | ||||
|                 Ile =  7.4, | ||||
|                 Leu =  7.4, | ||||
|                 Lys = 10.4, | ||||
|                 Met =  7.4, | ||||
|                 Phe =  7.4, | ||||
|                 Pro =  7.4, | ||||
|                 Ser =  7.4, | ||||
|                 Thr =  7.4, | ||||
|                 Trp =  7.4, | ||||
|                 Tyr =  9.8, | ||||
|                 Val =  7.4)) | ||||
|  | ||||
|  | ||||
| # Given these biophysical indices, how similar are the amino acids? We have | ||||
| # three dimensions of measures here. Scatterplots can only display two | ||||
| # dimensions ... | ||||
|  | ||||
| # pull the names from Y$I, convert them to single letter code, and reorder the | ||||
| # AACOLS palette accordingly ... | ||||
| aac <- AACOLS[toupper(seqinr::a(names(Y$I)))] | ||||
|  | ||||
| plot(Y$I, V$I, | ||||
|      xlab = "hydrophobicity", ylab = "volume", | ||||
|      pch = 21, | ||||
|      cex = 6, | ||||
|      col = aac, | ||||
|      bg  = aac) | ||||
| text(Y$I, V$I, names(Y$I), cex = 0.8) | ||||
|  | ||||
| plot(Y$I, K$I, | ||||
|      xlab = "hydrophobicity", ylab = "pK", | ||||
|      pch = 21, | ||||
|      cex = 6, | ||||
|      col = aac, | ||||
|      bg  = aac) | ||||
| text(Y$I, K$I, names(Y$I), cex = 0.8) | ||||
|  | ||||
| # ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such | ||||
| # plots are in general unintuitive and hard to interpret. One alternative is a | ||||
| # so-called "ternary plot": | ||||
|  | ||||
| if (! requireNamespace("ggtern", quietly=TRUE)) { | ||||
|   install.packages("ggtern") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = ggtern)       # basic information | ||||
| #  browseVignettes("ggtern")    # available vignettes | ||||
| #  data(package = "ggtern")     # available datasets | ||||
|  | ||||
|  | ||||
|  | ||||
| # Rescale each of the three indices linearly so that its minimum maps to 0.05 | ||||
| # and its maximum to 0.95, and collect them as the columns phi, vol and pK of | ||||
| # a data frame with one row per amino acid. | ||||
| myDat <- as.data.frame(lapply(list(phi = Y$I, vol = V$I, pK = K$I), | ||||
|                               function(x) { | ||||
|                                 0.9 * ((x - min(x)) / (max(x) - min(x))) + 0.05 | ||||
|                               }), | ||||
|                        stringsAsFactors = FALSE) | ||||
| rownames(myDat) <- names(Y$I) | ||||
|  | ||||
| ggtern::ggtern(data = myDat, | ||||
|                ggplot2::aes(x = vol, | ||||
|                    y = phi, | ||||
|                    z = pK, | ||||
|                    label = rownames(myDat))) + ggplot2::geom_text() | ||||
|  | ||||
| # This results in a mapping of amino acids relative to each other that is | ||||
| # similar to the Venn diagram you have seen in the notes. | ||||
|  | ||||
| # ... or we could use principal components analysis, to pull out the | ||||
| # best projection of the three feature dimensions into two. (Done here without delving | ||||
| # into the theory ...) | ||||
| # prcomp() computes the principal components of the three scaled features; | ||||
| # the first two columns of prc$x are plotted against each other. The points | ||||
| # are coloured with the aac palette that was reordered to match names(Y$I). | ||||
| prc <- prcomp(myDat) | ||||
| plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n", | ||||
|      pch=19, cex=6, col=aac, cex.main=0.7, | ||||
|      main="Principal Component Analysis of Amino Acid Features") | ||||
| text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088") | ||||
|  | ||||
| # This matches the intuition rather well in that "similar" amino acids are close | ||||
| # on the plot. But we can't interpret the distances in terms of just one of the | ||||
| # parameters. Whatever - nature has a different way to define similarity: | ||||
| # mutations to similar amino acids are less likely to break the protein. | ||||
|  | ||||
|  | ||||
| # =    2  Mutation Data matrix  ================================================ | ||||
|  | ||||
| # A mutation data matrix encodes all amino acid pairscores in a matrix. | ||||
|  | ||||
| # The Biostrings package contains the most common mutation data matrices. | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly=TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly=TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help=Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")  # available vignettes | ||||
| #  data(package = "Biostrings")   # available datasets | ||||
|  | ||||
| # Let's load the BLOSUM62 mutation data matrix from the package | ||||
| data(BLOSUM62, package = "Biostrings") | ||||
|  | ||||
| # ... and see what it contains. (You've seen this matrix before.) | ||||
| BLOSUM62 | ||||
|  | ||||
| # We can simply access values via the row/column names. | ||||
| # Identical amino acids have high scores ... | ||||
| BLOSUM62["H", "H"]   # Score for a pair of two histidines | ||||
| BLOSUM62["S", "S"]   # Score for a pair of two serines | ||||
|  | ||||
| # Similar amino acids have low positive scores ... | ||||
| BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair | ||||
| BLOSUM62["F", "Y"]   # Score for a phenylalanine / tyrosine pair | ||||
|  | ||||
| # Dissimilar amino acids have negative scores ... | ||||
| BLOSUM62["L", "K"]   # Score for a leucine / lysine pair | ||||
| BLOSUM62["Q", "P"]   # Score for a glutamine / proline pair | ||||
|  | ||||
|  | ||||
| BLOSUM62["R", "W"]   # the matrix is symmetric! | ||||
| BLOSUM62["W", "R"] | ||||
|  | ||||
|  | ||||
| # =    3  Background score  ==================================================== | ||||
|  | ||||
| # The mutation data matrix is designed to give high scores to homologous | ||||
| # sequences, low scores to non-homologous sequences. What score on average | ||||
| # should we expect for a random sequence? | ||||
|  | ||||
| # If we sample amino acid pairs at random, we will get a score that is the | ||||
| # average of the individual pairscores in the matrix. Omitting the ambiguity | ||||
| # codes and the gap character: | ||||
|  | ||||
| sum(BLOSUM62[1:20, 1:20])/400 | ||||
|  | ||||
| # But that score could be higher for real sequences, for which the amino acid | ||||
| # distribution is not random. For example membrane proteins have a large number | ||||
| # of hydrophobic residues - an alignment of unrelated proteins might produce | ||||
| # positive scores. And there are other proteins with biased amino acid | ||||
| # compositions, in particular proteins that interact with multiple other | ||||
| # proteins. Let's test how this impacts the background score by comparing a | ||||
| # sequence with shuffled sequences. These have the same composition, but are | ||||
| # obviously not homologous. The data directory contains the FASTA file for the | ||||
| # PDB ID 3FG7 - a villin headpiece structure with a large amount of | ||||
| # low-complexity amino acid sequence ... | ||||
|  | ||||
| aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]] | ||||
|  | ||||
| # ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C) | ||||
| # with an exceptionally high percentage of hydrophobic residues. | ||||
|  | ||||
| aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]] | ||||
|  | ||||
| # Here is a function that takes two sequences and | ||||
| # returns their average pairscore. | ||||
|  | ||||
| averagePairScore <- function(a, b, MDM = BLOSUM62) { | ||||
|   # Returns average pairscore of two sequences. | ||||
|   # Parameters: | ||||
|   #    a, b   chr   amino acid sequence string | ||||
|   #    MDM          mutation data matrix. Default is BLOSUM62 | ||||
|   # Value:    num   average pairscore. | ||||
|   a <- unlist(strsplit(a, "")) | ||||
|   b <- unlist(strsplit(b, "")) | ||||
|   v <- 0 | ||||
|   for (i in seq_along(a)) { | ||||
|     v <- v + MDM[ a[i], b[i] ] | ||||
|   } | ||||
|   return(v / length(a)) | ||||
| } | ||||
|  | ||||
| orig3FG7 <- toString(aa3FG7) | ||||
| orig2F1C <- toString(aa2F1C) | ||||
| N <- 1000 | ||||
| scores3FG7 <- numeric(N) | ||||
| scores2F1C <- numeric(N) | ||||
| for (i in 1:N) { | ||||
|   scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7))) | ||||
|   scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C))) | ||||
| } | ||||
|  | ||||
| # Plot the distributions | ||||
| hist(scores3FG7, | ||||
|      col="#5599EE33", | ||||
|      breaks = seq(-1.5, 0, by=0.1), | ||||
|      main = "Pairscores for randomly shuffled sequences", | ||||
|      xlab = "Average pairscore from BLOSUM 62") | ||||
| hist(scores2F1C, | ||||
|      col="#55EE9933", | ||||
|      breaks = seq(-1.5, 0, by=0.1), | ||||
|      add = TRUE) | ||||
| abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2) | ||||
| legend('topright', | ||||
|        c("3FG7 (villin)", "2F1C (OmpG)"), | ||||
|        fill = c("#5599EE33", "#55EE9933"), bty = 'n', | ||||
|        inset = 0.1) | ||||
|  | ||||
| # This is an important result: even though we have shuffled significantly biased | ||||
| # sequences, and the average scores trend above the average of the mutation data | ||||
| # matrix, the average scores still remain comfortably below zero. This means | ||||
| # that we can't (in general) improve a high-scoring alignment by simply | ||||
| # extending it with randomly matched residues. We will only improve the score if | ||||
| # the similarity of newly added residues is larger than what we expect to get by | ||||
| # random chance! | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,216 +1,216 @@ | ||||
| # tocID <- "BIN-Data_integration.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-Data_integration unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2018-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance and updates | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0.1  Bugfix: UniProt ID Mapping service API change | ||||
| #           1.0    First live version | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #           Develop a fungi-specific BioMart example. | ||||
| #           (cf. | ||||
| # https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html ) | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                             Line | ||||
| #TOC> ------------------------------------------------- | ||||
| #TOC>   1        Identifier mapping                  42 | ||||
| #TOC>   2        Cross-referencing tables           165 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Identifier mapping  ================================================== | ||||
|  | ||||
| # UniProt provides a well-designed ID mapping tool that can be accessed | ||||
| # online at     http://www.uniprot.org/mapping/ | ||||
| # | ||||
| # Here we will use the UniProt Web API for this tool to map identifiers. The | ||||
| # UniProt ID mapping service supports a "RESTful API": responses can be obtained | ||||
| # simply via a Web browser's request. Such requests are commonly sent via the | ||||
| # GET or POST verbs that a Webserver responds to, when a client asks for data. | ||||
| # GET requests are visible in the URL of the request; POST requests are not | ||||
| # directly visible, they are commonly used to send the contents of forms, or | ||||
| # when transmitting larger, complex data items. The UniProt ID mapping service | ||||
| # can accept long lists of IDs, thus using the POST mechanism makes sense. GET() | ||||
| # and  POST() functions are part of the httr package. | ||||
|  | ||||
| # To begin, we load  httr, which supports sending and receiving data via the | ||||
| # http protocol, just like a Web browser. | ||||
| if (! requireNamespace("httr", quietly=TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = httr)       # basic information | ||||
| #  browseVignettes("httr")    # available vignettes | ||||
| #  data(package = "httr")     # available datasets | ||||
|  | ||||
|  | ||||
| # We will walk through the process with the refSeqID | ||||
| # of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what | ||||
| # happens if the ID can't be mapped: | ||||
| myQueryIDs <- "NP_010227 NP_00000 NP_011036" | ||||
|  | ||||
|  | ||||
| # The UniProt ID mapping service API is very straightforward to use: just define | ||||
| # the URL of the server and send a list of items labelled as "query" in the body | ||||
| # of the request. GET() and POST() are functions from httr. | ||||
|  | ||||
| # Note. A recent bug in the interaction between the server expectations and the | ||||
| # curl client libraries requires the following initialization | ||||
| httr::set_config(httr::config(http_version = 0)) | ||||
| # cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b | ||||
|  | ||||
|  | ||||
| URL <- "https://www.uniprot.org/mapping/" | ||||
| response <- httr::POST(URL, | ||||
|                        body = list(from = "P_REFSEQ_AC",   # Refseq Protein | ||||
|                                    to = "ACC",             # UniProt ID | ||||
|                                    format = "tab", | ||||
|                                    query = myQueryIDs)) | ||||
|  | ||||
| cat(httr::content(response)) | ||||
|  | ||||
| # We need to check the status code - if it is not 200, an error occurred and we | ||||
| # can't process the result: | ||||
| httr::status_code(response) | ||||
|  | ||||
| # If the query is successful, tabbed text is returned. We can assign that to a | ||||
| # data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument. | ||||
|  | ||||
| myMappedIDs <- read.delim(file = textConnection(httr::content(response)), | ||||
|                           sep = "\t", | ||||
|                           stringsAsFactors = FALSE) | ||||
| myMappedIDs | ||||
|  | ||||
| # If this works as expected, you should see: | ||||
| #        From     To | ||||
| # 1 NP_010227 P39678 | ||||
| # 2 NP_011036 P25302 | ||||
| # | ||||
| # ... and note that there are only two entries, because nothing was returned | ||||
| # for the dummy "RefSeq ID" NP_00000 | ||||
|  | ||||
| # If the query can't be fulfilled because of a problem with the server, a | ||||
| # WebPage is returned. But the server status is also returned and we can check | ||||
| # the status code. I have lately gotten many "503" status codes: Server Not | ||||
| # Available... | ||||
|  | ||||
| # We wrap this into a function: | ||||
|  | ||||
| myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { | ||||
|   # Use UniProt ID mapping service to map one or more IDs | ||||
|   # Parameters: | ||||
|   #    s  char  A string of separated IDs | ||||
|   #    mapFrom  char  the database in which the IDs in s are valid. Default | ||||
|   #                     is RefSeq protein | ||||
|   #    mapTo    char  the database in which the target IDs are valid. Default | ||||
|   #                     is UniProtKB | ||||
|   # Value | ||||
|   #    a data frame of mapped IDs, with column names From and To, or an | ||||
|   #    empty data frame if the mapping was unsuccessful. No rows are returned | ||||
|   #    for IDs that are not mapped. | ||||
|  | ||||
|   # Initialize curl | ||||
|   httr::set_config(httr::config(http_version = 0)) | ||||
|  | ||||
|   URL <- "https://www.uniprot.org/uploadlists/" | ||||
|   response <- httr::POST(URL, | ||||
|                          body = list(from = mapFrom, | ||||
|                                      to = mapTo, | ||||
|                                      format = "tab", | ||||
|                                      query = s)) | ||||
|  | ||||
|   if (httr::status_code(response) == 200) { # 200: oK | ||||
|     myMap <- read.delim(file = textConnection(httr::content(response)), | ||||
|                         sep = "\t", | ||||
|                         stringsAsFactors = FALSE) | ||||
|     colnames(myMap) <- c("From", "To") | ||||
|   } else { | ||||
|     myMap <- data.frame() | ||||
|     warning(paste("No uniProt ID mapping returned:", | ||||
|                   "server sent status", | ||||
|                   httr::status_code(response))) | ||||
|   } | ||||
|  | ||||
|   return(myMap) | ||||
| } | ||||
|  | ||||
| # Try it out ... | ||||
| myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165") | ||||
|  | ||||
| # A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded | ||||
| # into your workspace on startup. | ||||
|  | ||||
|  | ||||
| # =    2  Cross-referencing tables  ============================================ | ||||
|  | ||||
| # Sometimes we get the IDs we need to map in a large table, e.g. from a list of | ||||
| # genes in a model organism database such as SGD, or from the HUGO Gene | ||||
| # Nomenclature Committee. How do we map one set of identifiers to another one? | ||||
|  | ||||
| # The function to use is match(). | ||||
| # Here is a tiny set of identifiers taken from a much larger table to | ||||
| # illustrate the principle: | ||||
| # | ||||
|  | ||||
| myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747", | ||||
|                               "Q08641", "P47129", "P52910", "P00330", "P81450"), | ||||
|                     name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4", | ||||
|                               "AB140", "ACF4", "ACS2", "ADH1", "ATP18"), | ||||
|                     refID = c("NP_014657", "NP_009386", | ||||
|                               "NP_012683", "NP_012559", | ||||
|                               "NP_010038", "NP_014882", | ||||
|                               "NP_012616", "NP_013254", | ||||
|                               "NP_014555", "NP_013629")) | ||||
|  | ||||
| myIDs | ||||
|  | ||||
| # Say we want to map "NP_010038", "NP_999999", and "NP_013629", in that order to | ||||
| # their gene names. ("NP_999999" is a dummy ID that is not in the table.) | ||||
| myQuery <- c("NP_010038", "NP_999999", "NP_013629") | ||||
|  | ||||
| # %in% will only tell us if these IDs are present in the table: | ||||
| myQuery %in% myIDs$refID | ||||
|  | ||||
| # ... but not where they are located. But match() does what we need here: | ||||
| match(myQuery, myIDs$refID) | ||||
|  | ||||
| # ... and we can use the result to subset the column that we want to map to: | ||||
| myIDs$name[match(myQuery, myIDs$refID)] | ||||
|  | ||||
| # Note that the output preserves the NA - i.e. the length of the mapped | ||||
| # values is exactly the same as the length of the query. | ||||
|  | ||||
| # task: map the three genes to their UniProt Identifier. | ||||
|  | ||||
|  | ||||
| # | ||||
| # Note: if you want to do very many queries in very large tables, use the | ||||
| # fmatch() function in the "fastmatch" package for a considerable | ||||
| # speedup. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-Data_integration.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-Data_integration unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2018-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance and updates | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0.1  Bugfix: UniProt ID Mapping service API change | ||||
| #           1.0    First live version | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #           Develop a fungi-specific BioMart example. | ||||
| #           (cf. | ||||
| # https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html ) | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                             Line | ||||
| #TOC> ------------------------------------------------- | ||||
| #TOC>   1        Identifier mapping                  42 | ||||
| #TOC>   2        Cross-referencing tables           165 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Identifier mapping  ================================================== | ||||
|  | ||||
| # UniProt provides a well-designed ID mapping tool that can be accessed | ||||
| # online at     http://www.uniprot.org/mapping/ | ||||
| # | ||||
| # Here we will use the UniProt Web API for this tool to map identifiers. The | ||||
| # UniProt ID mapping service supports a "RESTful API": responses can be obtained | ||||
| # simply via a Web browser's request. Such requests are commonly sent via the | ||||
| # GET or POST verbs that a Webserver responds to, when a client asks for data. | ||||
| # GET requests are visible in the URL of the request; POST requests are not | ||||
| # directly visible, they are commonly used to send the contents of forms, or | ||||
| # when transmitting larger, complex data items. The UniProt ID mapping service | ||||
| # can accept long lists of IDs, thus using the POST mechanism makes sense. GET() | ||||
| # and  POST() functions are part of the httr package. | ||||
|  | ||||
| # To begin, we load  httr, which supports sending and receiving data via the | ||||
| # http protocol, just like a Web browser. | ||||
| if (! requireNamespace("httr", quietly=TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = httr)       # basic information | ||||
| #  browseVignettes("httr")    # available vignettes | ||||
| #  data(package = "httr")     # available datasets | ||||
|  | ||||
|  | ||||
| # We will walk through the process with the refSeqID | ||||
| # of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what | ||||
| # happens if the ID can't be mapped: | ||||
| myQueryIDs <- "NP_010227 NP_00000 NP_011036" | ||||
|  | ||||
|  | ||||
| # The UniProt ID mapping service API is very straightforward to use: just define | ||||
| # the URL of the server and send a list of items labelled as "query" in the body | ||||
| # of the request. GET() and POST() are functions from httr. | ||||
|  | ||||
| # Note. A recent bug in the interaction between the server expectations and the | ||||
| # curl client libraries requires the following initialization | ||||
| httr::set_config(httr::config(http_version = 0)) | ||||
| # cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b | ||||
|  | ||||
|  | ||||
| URL <- "https://www.uniprot.org/mapping/" | ||||
| response <- httr::POST(URL, | ||||
|                        body = list(from = "P_REFSEQ_AC",   # Refseq Protein | ||||
|                                    to = "ACC",             # UniProt ID | ||||
|                                    format = "tab", | ||||
|                                    query = myQueryIDs)) | ||||
|  | ||||
| cat(httr::content(response)) | ||||
|  | ||||
| # We need to check the status code - if it is not 200, an error occurred and we | ||||
| # can't process the result: | ||||
| httr::status_code(response) | ||||
|  | ||||
| # If the query is successful, tabbed text is returned. We can assign that to a | ||||
| # data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument. | ||||
|  | ||||
| myMappedIDs <- read.delim(file = textConnection(httr::content(response)), | ||||
|                           sep = "\t", | ||||
|                           stringsAsFactors = FALSE) | ||||
| myMappedIDs | ||||
|  | ||||
| # If this works as expected, you should see: | ||||
| #        From     To | ||||
| # 1 NP_010227 P39678 | ||||
| # 2 NP_011036 P25302 | ||||
| # | ||||
| # ... and note that there are only two entries, because nothing was returned | ||||
| # for the dummy "RefSeq ID" NP_00000 | ||||
|  | ||||
| # If the query can't be fulfilled because of a problem with the server, a | ||||
| # WebPage is returned. But the server status is also returned and we can check | ||||
| # the status code. I have lately gotten many "503" status codes: Server Not | ||||
| # Available... | ||||
|  | ||||
| # We wrap this into a function: | ||||
|  | ||||
| myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { | ||||
|   # Use UniProt ID mapping service to map one or more IDs | ||||
|   # Parameters: | ||||
|   #    s  char  A string of separated IDs | ||||
|   #    mapFrom  char  the database in which the IDs in s are valid. Default | ||||
|   #                     is RefSeq protein | ||||
|   #    mapTo    char  the database in which the target IDs are valid. Default | ||||
|   #                     is UniProtKB | ||||
|   # Value | ||||
|   #    a data frame of mapped IDs, with column names From and To, or an | ||||
|   #    empty data frame if the mapping was unsuccessful. No rows are returned | ||||
|   #    for IDs that are not mapped. | ||||
|  | ||||
|   # Initialize curl | ||||
|   httr::set_config(httr::config(http_version = 0)) | ||||
|  | ||||
|   URL <- "https://www.uniprot.org/uploadlists/" | ||||
|   response <- httr::POST(URL, | ||||
|                          body = list(from = mapFrom, | ||||
|                                      to = mapTo, | ||||
|                                      format = "tab", | ||||
|                                      query = s)) | ||||
|  | ||||
|   if (httr::status_code(response) == 200) { # 200: oK | ||||
|     myMap <- read.delim(file = textConnection(httr::content(response)), | ||||
|                         sep = "\t", | ||||
|                         stringsAsFactors = FALSE) | ||||
|     colnames(myMap) <- c("From", "To") | ||||
|   } else { | ||||
|     myMap <- data.frame() | ||||
|     warning(paste("No uniProt ID mapping returned:", | ||||
|                   "server sent status", | ||||
|                   httr::status_code(response))) | ||||
|   } | ||||
|  | ||||
|   return(myMap) | ||||
| } | ||||
|  | ||||
| # Try it out ... | ||||
| myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165") | ||||
|  | ||||
| # A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded | ||||
| # into your workspace on startup. | ||||
|  | ||||
|  | ||||
| # =    2  Cross-referencing tables  ============================================ | ||||
|  | ||||
| # Sometimes we get the IDs we need to map in a large table, e.g. from a list of | ||||
| # genes in a model organism database such as SGD, or from the HUGO Gene | ||||
| # Nomenclature Committee. How do we map one set of identifiers to another one? | ||||
|  | ||||
| # The function to use is match(). | ||||
| # Here is a tiny set of identifiers taken from a much larger table to | ||||
| # illustrate the principle: | ||||
| # | ||||
|  | ||||
| myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747", | ||||
|                               "Q08641", "P47129", "P52910", "P00330", "P81450"), | ||||
|                     name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4", | ||||
|                               "AB140", "ACF4", "ACS2", "ADH1", "ATP18"), | ||||
|                     refID = c("NP_014657", "NP_009386", | ||||
|                               "NP_012683", "NP_012559", | ||||
|                               "NP_010038", "NP_014882", | ||||
|                               "NP_012616", "NP_013254", | ||||
|                               "NP_014555", "NP_013629")) | ||||
|  | ||||
| myIDs | ||||
|  | ||||
| # Say we want to map "NP_010038", "NP_999999", and "NP_013629", in that order to | ||||
| # their gene names. ("NP_999999" is a dummy ID that is not in the table.) | ||||
| myQuery <- c("NP_010038", "NP_999999", "NP_013629") | ||||
|  | ||||
| # %in% will only tell us if these IDs are present in the table: | ||||
| myQuery %in% myIDs$refID | ||||
|  | ||||
| # ... but not where they are located. But match() does what we need here: | ||||
| match(myQuery, myIDs$refID) | ||||
|  | ||||
| # ... and we can use the result to subset the column that we want to map to: | ||||
| myIDs$name[match(myQuery, myIDs$refID)] | ||||
|  | ||||
| # Note that the output preserves the NA - i.e. the length of the mapped | ||||
| # values is exactly the same as the length of the query. | ||||
|  | ||||
| # task: map the three genes to their UniProt Identifier. | ||||
|  | ||||
|  | ||||
| # | ||||
| # Note: if you want to do very many queries in very large tables, use the | ||||
| # fmatch() function in the "fastmatch" package for a considerable | ||||
| # speedup. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,435 +1,435 @@ | ||||
| # tocID <- "BIN-FUNC-Domain_annotation.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-FUNC-Domain_annotation unit. | ||||
| # | ||||
| # ============================================================================== | ||||
| # Version:  1.4 | ||||
| # | ||||
| # Date:     2017-11  -  2020-10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.4    Add code for shared data import from the Wiki | ||||
| #           1.3    Add code for database export to JSON and instructions | ||||
| #                  for uploading annotations to the Public Student Wiki page | ||||
| #           1.2    Consistently: data in ./myScripts/ ; | ||||
| #                    begin SHARING DATA section | ||||
| #           1.1    2020 Updates | ||||
| #           1.0    Live version 2017 | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # TODO: | ||||
| #           Put the domain plot into a function | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                                 Line | ||||
| #TOC> --------------------------------------------------------------------- | ||||
| #TOC>   1        Update your database script                             51 | ||||
| #TOC>   1.1        Preparing an annotation file ...                      58 | ||||
| #TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61 | ||||
| #TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109 | ||||
| #TOC>   1.2        Execute and Validate                                 136 | ||||
| #TOC>   2        Plot Annotations                                       161 | ||||
| #TOC>   3        SHARING DATA                                           287 | ||||
| #TOC>   3.1        Post MBP1_MYSPE as JSON data                         303 | ||||
| #TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Update your database script  ========================================= | ||||
|  | ||||
|  | ||||
| # Since you have recorded domain features at the SMART database, we can store | ||||
| # the feature annotations in myDB ... | ||||
|  | ||||
|  | ||||
| # ==   1.1  Preparing an annotation file ...  ================================== | ||||
|  | ||||
|  | ||||
| # ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment" | ||||
| # | ||||
| #   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: | ||||
| # | ||||
| #   You DON'T already have a file called "<MYSPE>-Annotations.json" in the | ||||
| #   ./myScripts/ directory: | ||||
| # | ||||
| #   - Make a copy of the file "./data/refAnnotations.json" and put it in your | ||||
| #     myScripts/ directory. | ||||
| # | ||||
| #   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. | ||||
| #     if MYSPE is called "Cryptococcus neoformans", your file should be called | ||||
| #     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is | ||||
| #     "MBP1_CRYNE"). | ||||
| # | ||||
| #   - Open the file in the RStudio editor and delete all blocks for | ||||
| #     the Mbp1 protein annotations except the first one. | ||||
| # | ||||
| #   - From that block, delete all lines that have annotations you did not | ||||
| #     find in SMART for MBP1_MYSPE. | ||||
| # | ||||
| #   - Make enough copies of the "Ankyrin fold" and "low complexity" region | ||||
| #     lines to have a line for each feature you found. | ||||
| # | ||||
| #   - Then delete the comma at the end of the last line. | ||||
| # | ||||
| #   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere | ||||
| #     and change the "start" and "end" features to the coordinates you | ||||
| #     recorded in the SMART database. | ||||
| # | ||||
| #   - Save your file in the ./myScripts/ folder. | ||||
| # | ||||
| #   - Validate your file online at https://jsonlint.com/ | ||||
| # | ||||
| #   - Update your "./myScripts/makeProteinDB.R" script to load your new | ||||
| #     annotation when you recreate the database. Open the script in the | ||||
| #     RStudio editor, and add the following command at the end: | ||||
| # | ||||
| #     myDB <- dbAddAnnotation(myDB, | ||||
| #         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) | ||||
| #                                         ^^^^^^^ | ||||
| #                                        edit this! | ||||
| # | ||||
| #   - save and close the file. | ||||
| # | ||||
| # Then SKIP the next section. | ||||
| # | ||||
| # | ||||
| # ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"   | ||||
| # | ||||
| #   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: | ||||
| # | ||||
| #   You SHOULD have a file called "<MYSPE>-Annotations.json" in the | ||||
| #  ./myScripts/ directory: | ||||
| # | ||||
| #   - Open the file in the RStudio editor. | ||||
| # | ||||
| #   - Make as many copies of the "APSES fold" line as you have found | ||||
| #     features in SMART. | ||||
| # | ||||
| #   - Add a comma after every line except for the last one | ||||
| # | ||||
| #   - Edit the annotations but include only features that are in the | ||||
| #     myDB$feature table. Check which features are in the database by executing | ||||
| # | ||||
| #        myDB$feature$name | ||||
| # | ||||
| #   - Update the "start" and "end" coordinates for each feature to the | ||||
| #     values you found. | ||||
| # | ||||
| #   - Save your file. | ||||
| # | ||||
| #   - Validate your file online at https://jsonlint.com/ | ||||
| # | ||||
| # | ||||
| # ==   1.2  Execute and Validate  ============================================== | ||||
| # | ||||
| #   - source() your database creation script: | ||||
| # | ||||
| #     source("./myScripts/makeProteinDB.R") | ||||
| # | ||||
| #     This should run without errors or warnings. If it doesn't work and you | ||||
| #     can't figure out quickly what's happening, ask for help on the | ||||
| #     Discussion Board. | ||||
| # | ||||
| #   - Confirm | ||||
| #     The following commands should retrieve all of the features that have been | ||||
| #     annotated for MBP1_MYSPE | ||||
|  | ||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||
|  | ||||
| (proID  <- myDB$protein$ID[sel]) | ||||
| (fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID]) | ||||
| (ftrIDs <- unique(myDB$annotation$featureID[fanIDs])) | ||||
| myDB$feature$name[ftrIDs] # This should list ALL of your annotated features | ||||
|                           # (once). If not, consider what could have gone wrong | ||||
|                           # and ask on the list if you have difficulties fixing | ||||
|                           # it. | ||||
|  | ||||
|  | ||||
| # =    2  Plot Annotations  ==================================================== | ||||
|  | ||||
| # In this section we will plot domain annotations as colored rectangles on a | ||||
| # sequence, as an example of using the R plotting system for generic, data | ||||
| # driven images. | ||||
|  | ||||
| # We need a small utility function that draws the annotation boxes on a | ||||
| # representation of sequence. It should accept the start and end coordinates, | ||||
| # the y value where it should be plotted and the color of the box, and plot a | ||||
| # rectangle using R's rect() function. | ||||
|  | ||||
| drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) { | ||||
|   # Purpose: | ||||
|   #   Add one filled rectangle with a black outline to the current plot. | ||||
|   # Parameters: | ||||
|   #   xStart, xEnd:  left and right edge of the rectangle | ||||
|   #   y:             vertical centre of the rectangle | ||||
|   #   myCol:         fill colour | ||||
|   #   DELTA:         half-height: the box spans y - DELTA to y + DELTA | ||||
|   # Value: | ||||
|   #   The return value of rect(); called for its side effect on the plot. | ||||
|   yBottom <- y - DELTA | ||||
|   yTop    <- y + DELTA | ||||
|   rect(xleft = xStart, ybottom = yBottom, xright = xEnd, ytop = yTop, | ||||
|        col = myCol, border = "black") | ||||
| } | ||||
|  | ||||
| # test this: draw a horizontal line, then a box centred on it | ||||
| plot(c(-1.5, 1.5), c(0, 0), type = "l") | ||||
| drawBox(-1, 1, 0.0, "peachpuff") | ||||
|  | ||||
| # Next, we define a function to plot annotations for one protein: the name of | ||||
| # the protein, a horizontal grey line for its length, and all of its features. | ||||
|  | ||||
| plotProtein <- function(DB, name, y) { | ||||
|   # Purpose: | ||||
|   #   Add the annotations of one protein to the current plot: its name, a | ||||
|   #   grey line as long as its sequence, and one box per annotated feature. | ||||
|   # Parameters: | ||||
|   #   DB:    protein database | ||||
|   #   name:  the name of the protein in the database. | ||||
|   #   y:     height where to draw the plot | ||||
|   # Value: | ||||
|   #   NULL (invisible). Called for its side effect of drawing on the plot. | ||||
|   # | ||||
|   # Define colors: we create a vector of color values, one for | ||||
|   # each feature, and we give it names of the feature ID. Then we | ||||
|   # can easily get the color value from the feature ID. | ||||
|   # A: make a vector of color values. The syntax may appear unusual - | ||||
|   #    colorRampPalette() returns a function, and we simply append | ||||
|   #    the parameter (number-of-features) without assigning the function | ||||
|   #    to its own variable name. | ||||
|   ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00", "#62C923", | ||||
|                                "#0A9A9B", "#1958C3", "#8000D3", "#D0007F"), | ||||
|                              space = "Lab", | ||||
|                              interpolate = "linear")(nrow(DB$feature)) | ||||
|   # B: Features may overlap, so we make the colors transparent by setting | ||||
|   #    their "alpha channel" to 1/3  (hex: 55) | ||||
|   ftrCol <- paste0(ftrCol, "55") | ||||
|   # C: we assign names | ||||
|   names(ftrCol) <- DB$feature$ID | ||||
|   # E.g. color for the third feature: | ||||
|   #   ftrCol[ as.character(DB$feature$ID[3]) ] | ||||
|  | ||||
|   # find the row-index of the protein in the protein table of DB | ||||
|   iProtein <- which(DB$protein$name == name) | ||||
|   if (length(iProtein) != 1) { | ||||
|     # Fail early with a clear message, before anything is drawn. | ||||
|     stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.", | ||||
|                  name, length(iProtein)), | ||||
|          call. = FALSE) | ||||
|   } | ||||
|  | ||||
|   # write the name of the protein | ||||
|   text(-30, y, adj = 1, labels = name, cex = 0.75) | ||||
|  | ||||
|   # draw a line from 0 to nchar(sequence-of-the-protein) | ||||
|   lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y), | ||||
|         lwd = 3, col = "#999999") | ||||
|  | ||||
|   # get the rows of feature annotations for the protein | ||||
|   iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein]) | ||||
|  | ||||
|   # draw a colored box for each feature | ||||
|   for (i in iFtr) { | ||||
|     # Subscript with a character key so the color is always found by its | ||||
|     # name (the feature ID), never by position - even if IDs are numbers. | ||||
|     drawBox(DB$annotation$start[i], | ||||
|             DB$annotation$end[i], | ||||
|             y, | ||||
|             ftrCol[ as.character(DB$annotation$featureID[i]) ]) | ||||
|   } | ||||
| } | ||||
|  | ||||
| # Plot each annotated protein: | ||||
| # Get the rows of all unique annotated Mbp1 proteins in myDB | ||||
|  | ||||
| iRows <- grep("^MBP1_", myDB$protein$name) | ||||
|  | ||||
| # size the plot-frame to accommodate all proteins, with 10% head-room | ||||
| yMax <- 1.1 * length(iRows) | ||||
| xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))  # longest sequence | ||||
|  | ||||
| # save the current plot parameters, then decrease the margins | ||||
| oPar <- par(mar = c(4.2, 0.1, 3, 0.1)) | ||||
|  | ||||
| # plot an empty frame; the x-axis is added separately | ||||
| plot(1, 1, | ||||
|      type = "n", | ||||
|      axes = FALSE, | ||||
|      bty = "n", | ||||
|      xlim = c(-200, xMax + 100), | ||||
|      ylim = c(0, yMax), | ||||
|      main = "Mbp1 orthologue domain annotations", | ||||
|      xlab = "sequence position", | ||||
|      ylab = "", | ||||
|      cex.axis = 0.8) | ||||
| axis(side = 1, at = seq(from = 0, to = xMax, by = 100)) | ||||
|  | ||||
| # legend: one semi-transparent color per row of the feature table | ||||
| myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00", "#62C923", | ||||
|                                    "#0A9A9B", "#1958C3", "#8000D3", "#D0007F"), | ||||
|                                  space = "Lab", | ||||
|                                  interpolate = "linear")(nrow(myDB$feature)), | ||||
|                 "55") | ||||
| legend(x = xMax - 150, y = 7, | ||||
|        legend = myDB$feature$name, | ||||
|        fill = myCol, | ||||
|        cex = 0.7, | ||||
|        bty = "n") | ||||
|  | ||||
| # Finally, call plotProtein() once per protein; protein i is drawn at height i | ||||
| for (i in seq_along(iRows)) { | ||||
|   plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i) | ||||
| } | ||||
| par(oPar)  # reset the plot parameters | ||||
|  | ||||
|  | ||||
| # The plot shows what is variable and what is constant about the annotations in | ||||
| # a group of related proteins. Your MBP1_MYSPE annotations should appear at the | ||||
| # top. | ||||
|  | ||||
| # Task: | ||||
| #    Put a copy of the plot into your journal and interpret it with respect | ||||
| #    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot. | ||||
|  | ||||
| # Task: | ||||
| #    It would be better to align the motif borders, at least approximately (not | ||||
| #    all proteins have all motifs). How would you go about doing that? | ||||
|  | ||||
| # =    3  SHARING DATA  ======================================================== | ||||
|  | ||||
| # It's particularly interesting to compare such annotations across many | ||||
| # homologous proteins. I have created a page on the Student Wiki (the "Public" | ||||
| # page linked in section 3.1) that you can edit, and then download the data | ||||
| # from the entire class directly to your RStudio project. | ||||
| # | ||||
|  | ||||
| # I have provided a function that extracts all information that refers to a | ||||
| # single protein from the database, and prints it out as well-formatted JSON, | ||||
| # suitable to be pasted into our shareable Wiki-page. There is a fair amount of | ||||
| # bookkeeping involved, but the code is not otherwise very enlightening so I | ||||
| # will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you | ||||
| # would want to have a look. | ||||
|  | ||||
|  | ||||
| # ==   3.1  Post MBP1_MYSPE as JSON data  ====================================== | ||||
|  | ||||
| # Task: | ||||
| # ===== | ||||
| # 1: Run the following code: | ||||
|  | ||||
| # Print the Wiki markup for this protein, one element per line: the JSON | ||||
| # sits inside a <pre class="protein-data"> element. | ||||
| cat(sep = "\n", | ||||
|     "{{Vspace}}", | ||||
|     "<!-- ==== BEGIN  PROTEIN ==== -->", | ||||
|     "<pre class=\"protein-data\">", | ||||
|     dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))), | ||||
|     "</pre>", | ||||
|     "<!-- ===== END PROTEIN ====== -->", | ||||
|     "") | ||||
|  | ||||
| # 2: Copy the entire output from the console. | ||||
| # 3: Navigate to | ||||
| #      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public | ||||
| #    ... edit the page, and paste your output at the top. | ||||
| # 4: Save your edits. | ||||
|  | ||||
|  | ||||
|  | ||||
| # ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================ | ||||
|  | ||||
| # Once we have collected a number of protein annotations, we can access the | ||||
| # Wiki-page and import the data into our database. The Wiki page is an HTML | ||||
| # document with lots of MediaWiki-specific stuff - but the content we are | ||||
| # interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These | ||||
| # work like normal HTML <pre> tags, but we have defined a special class for them | ||||
| # to make it easy to parse out the contents we want. The rvest:: package in | ||||
| # combination with xml2:: provides us with all the tools we need for such | ||||
| # "Webscraping" of data.... | ||||
|  | ||||
| # rvest:: and xml2:: retrieve and parse the Wiki page ... | ||||
| if (! requireNamespace("rvest", quietly = TRUE)) { | ||||
|   install.packages("rvest") | ||||
| } | ||||
|  | ||||
| if (! requireNamespace("xml2", quietly = TRUE)) { | ||||
|   install.packages("xml2") | ||||
| } | ||||
|  | ||||
| # ... and jsonlite:: converts the JSON text we find there into R objects. | ||||
| if (! requireNamespace("jsonlite", quietly = TRUE)) { | ||||
|   install.packages("jsonlite") | ||||
| } | ||||
|  | ||||
| # Here's the process: | ||||
| # The URL is an "open" page on the student Wiki. Users that are not logged in | ||||
| # can view the contents, but you can only edit if you are logged in. | ||||
| myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public" | ||||
|  | ||||
| # First thing is to retrieve the HTML from the url... | ||||
| x <- xml2::read_html(myURL) | ||||
|  | ||||
| # This retrieves the page source, but that still needs to be parsed into its | ||||
| # logical elements. HTML is a subset of XML and such documents are structured as | ||||
| # trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes() | ||||
| # parses out the document structure and then uses a so-called "xpath" expression | ||||
| # to select nodes we are interested in. Now, xpath is one of those specialized | ||||
| # languages of which there are a few more to learn than one would care for. You | ||||
| # MUST know how to format sprintf() expressions, and you SHOULD be competent | ||||
| # with regular expressions. But if you want to be really competent in your work, | ||||
| # basic HTML and CSS is required ... and enough knowledge about xpath to be able | ||||
| # to search on Stackoverflow for what you need for parsing data out of Web | ||||
| # documents... | ||||
|  | ||||
| # The expression we use below is: | ||||
| #   - get any node anywhere in the tree ("//*") ... | ||||
| #   - that has a particular attribute("[@ ... ]"). | ||||
| #   - The attribute we want is that the class of the node is "protein-data"; | ||||
| #      that is the class we have defined for our <pre> tags. | ||||
| # As a result of this selection, we get a list of pointers to the document tree. | ||||
| y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]') | ||||
|  | ||||
| # Next we fetch the actual payload - the text - from the tree: | ||||
| # rvest::html_text() gets the text from the list of pointers. The result is a | ||||
| # normal character vector, with one string for each selected node. | ||||
| z <- rvest::html_text(y) | ||||
|  | ||||
| # Finally we can iterate over the list, and add all proteins we don't already | ||||
| # have to our database. There may well be items that are rejected because they | ||||
| # are already present in the database - for example, unless somebody has | ||||
| # annotated new features, all of the features are already there. Don't worry - | ||||
| # that is intended; we don't want duplicate entries. | ||||
|  | ||||
| for (thisJSON in z) { | ||||
|   # The Wiki entries are pasted in by hand, so one malformed entry must not | ||||
|   # abort the whole import. We parse inside tryCatch(): if that fails we warn | ||||
|   # and leave thisName as NULL, so the entry is skipped below. | ||||
|   thisName <- tryCatch({ | ||||
|     thisData <- jsonlite::fromJSON(thisJSON) | ||||
|     thisData$protein$name | ||||
|   }, error = function(e) { | ||||
|     warning("Skipping a Wiki entry that could not be parsed: ", | ||||
|             conditionMessage(e), call. = FALSE) | ||||
|     NULL | ||||
|   }) | ||||
|  | ||||
|   # Only add entries that describe exactly one protein which we don't | ||||
|   # already have in the database. | ||||
|   if (length(thisName) == 1 && ! thisName %in% myDB$protein$name) { | ||||
|     myDB <- dbAddProtein(myDB, thisData$protein) | ||||
|     myDB <- dbAddTaxonomy(myDB, thisData$taxonomy) | ||||
|     myDB <- dbAddFeature(myDB, thisData$feature) | ||||
|     myDB <- dbAddAnnotation(myDB, thisData$annotation) | ||||
|   } | ||||
| } | ||||
|  | ||||
| # Finally, we can repeat our domain plot with the results - which now include | ||||
| # the shared proteins: | ||||
|  | ||||
| iRows <- grep("^MBP1_", myDB$protein$name) | ||||
| yMax <- 1.1 * length(iRows) | ||||
| xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))  # longest sequence | ||||
|  | ||||
| # save the current plot parameters, decrease the margins, and plot an | ||||
| # empty frame; the x-axis is added separately | ||||
| oPar <- par(mar = c(4.2, 0.1, 3, 0.1)) | ||||
| plot(1, 1, | ||||
|      type = "n", | ||||
|      axes = FALSE, | ||||
|      bty = "n", | ||||
|      xlim = c(-200, xMax + 100), | ||||
|      ylim = c(0, yMax), | ||||
|      main = "Mbp1 orthologue domain annotations", | ||||
|      xlab = "sequence position", | ||||
|      ylab = "", | ||||
|      cex.axis = 0.8) | ||||
| axis(side = 1, at = seq(from = 0, to = xMax, by = 100)) | ||||
|  | ||||
| # legend: one semi-transparent color per row of the feature table | ||||
| myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00", "#62C923", | ||||
|                                    "#0A9A9B", "#1958C3", "#8000D3", "#D0007F"), | ||||
|                                  space = "Lab", | ||||
|                                  interpolate = "linear")(nrow(myDB$feature)), | ||||
|                 "55") | ||||
| legend(x = xMax - 150, y = 7, | ||||
|        legend = myDB$feature$name, | ||||
|        fill = myCol, | ||||
|        cex = 0.7, | ||||
|        bty = "n") | ||||
|  | ||||
| # call plotProtein() once per protein; protein i is drawn at height i | ||||
| for (i in seq_along(iRows)) { | ||||
|   plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i) | ||||
| } | ||||
| par(oPar)  # reset the plot parameters | ||||
|  | ||||
| # ... the more proteins we can compare, the more we learn about the | ||||
| # architectural principles of this family's domains. | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-FUNC-Domain_annotation.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-FUNC-Domain_annotation unit. | ||||
| # | ||||
| # ============================================================================== | ||||
| # Version:  1.4 | ||||
| # | ||||
| # Date:     2017-11  -  2020-10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.4    Add code for shared data import from the Wiki | ||||
| #           1.3    Add code for database export to JSON and instructions | ||||
| #                  for uploading annotations to the Public Student Wiki page | ||||
| #           1.2    Consistently: data in ./myScripts/ ; | ||||
| #                    begin SHARING DATA section | ||||
| #           1.1    2020 Updates | ||||
| #           1.0    Live version 2017 | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # TODO: | ||||
| #           Put the domain plot into a function | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                                 Line | ||||
| #TOC> --------------------------------------------------------------------- | ||||
| #TOC>   1        Update your database script                             51 | ||||
| #TOC>   1.1        Preparing an annotation file ...                      58 | ||||
| #TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61 | ||||
| #TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109 | ||||
| #TOC>   1.2        Execute and Validate                                 136 | ||||
| #TOC>   2        Plot Annotations                                       161 | ||||
| #TOC>   3        SHARING DATA                                           287 | ||||
| #TOC>   3.1        Post MBP1_MYSPE as JSON data                         303 | ||||
| #TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Update your database script  ========================================= | ||||
|  | ||||
|  | ||||
| # Since you have recorded domain features at the SMART database, we can store | ||||
| # the feature annotations in myDB ... | ||||
|  | ||||
|  | ||||
| # ==   1.1  Preparing an annotation file ...  ================================== | ||||
|  | ||||
|  | ||||
| # ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment" | ||||
| # | ||||
| #   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: | ||||
| # | ||||
| #   You DON'T already have a file called "<MYSPE>-Annotations.json" in the | ||||
| #   ./myScripts/ directory: | ||||
| # | ||||
| #   - Make a copy of the file "./data/refAnnotations.json" and put it in your | ||||
| #     myScripts/ directory. | ||||
| # | ||||
| #   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. | ||||
| #     if MYSPE is called "Cryptococcus neoformans", your file should be called | ||||
| #     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is | ||||
| #     "MBP1_CRYNE"). | ||||
| # | ||||
| #   - Open the file in the RStudio editor and delete all blocks for | ||||
| #     the Mbp1 protein annotations except the first one. | ||||
| # | ||||
| #   - From that block, delete all lines that have annotations you did not | ||||
| #     find in SMART for MBP1_MYSPE. | ||||
| # | ||||
| #   - Make enough copies of the "Ankyrin fold" and "low complexity" region | ||||
| #     lines to have a line for each feature you found. | ||||
| # | ||||
| #   - Then delete the comma at the end of the last line. | ||||
| # | ||||
| #   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere | ||||
| #     and change the "start" and "end" features to the coordinates you | ||||
| #     recorded in the SMART database. | ||||
| # | ||||
| #   - Save your file in the ./myScripts/ folder. | ||||
| # | ||||
| #   - Validate your file online at https://jsonlint.com/ | ||||
| # | ||||
| #   - Update your "./myScripts/makeProteinDB.R" script to load your new | ||||
| #     annotation when you recreate the database. Open the script in the | ||||
| #     RStudio editor, and add the following command at the end: | ||||
| # | ||||
| #     myDB <- dbAddAnnotation(myDB, | ||||
| #         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) | ||||
| #                                         ^^^^^^^ | ||||
| #                                        edit this! | ||||
| # | ||||
| #   - save and close the file. | ||||
| # | ||||
| # Then SKIP the next section. | ||||
| # | ||||
| # | ||||
| # ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"   | ||||
| # | ||||
| #   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: | ||||
| # | ||||
| #   You SHOULD have a file called "<MYSPE>-Annotations.json" in the | ||||
| #  ./myScripts/ directory: | ||||
| # | ||||
| #   - Open the file in the RStudio editor. | ||||
| # | ||||
| #   - Make as many copies of the "APSES fold" line as you have found | ||||
| #     features in SMART. | ||||
| # | ||||
| #   - Add a comma after every line except for the last one | ||||
| # | ||||
| #   - Edit the annotations but include only features that are in the | ||||
| #     myDB$feature table. Check which features are in the database by executing | ||||
| # | ||||
| #        myDB$feature$name | ||||
| # | ||||
| #   - Update the "start" and "end" coordinates for each feature to the | ||||
| #     values you found. | ||||
| # | ||||
| #   - Save your file. | ||||
| # | ||||
| #   - Validate your file online at https://jsonlint.com/ | ||||
| # | ||||
| # | ||||
| # ==   1.2  Execute and Validate  ============================================== | ||||
| # | ||||
| #   - source() your database creation script: | ||||
| # | ||||
| #     source("./myScripts/makeProteinDB.R") | ||||
| # | ||||
| #     This should run without errors or warnings. If it doesn't work and you | ||||
| #     can't figure out quickly what's happening, ask for help on the | ||||
| #     Discussion Board. | ||||
| # | ||||
| #   - Confirm | ||||
| #     The following commands should retrieve all of the features that have been | ||||
| #     annotated for MBP1_MYSPE | ||||
|  | ||||
| sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE)) | ||||
|  | ||||
| (proID  <- myDB$protein$ID[sel]) | ||||
| # IDs are keys, not row numbers: select the annotation rows with a logical | ||||
| # mask, and find the feature rows by matching the feature IDs. | ||||
| selFan  <- myDB$annotation$proteinID %in% proID | ||||
| (fanIDs <- myDB$annotation$ID[selFan]) | ||||
| (ftrIDs <- unique(myDB$annotation$featureID[selFan])) | ||||
| myDB$feature$name[match(ftrIDs, myDB$feature$ID)] | ||||
| # This should list ALL of your annotated features (once). If not, consider | ||||
| # what could have gone wrong and ask on the Discussion Board if you have | ||||
| # difficulties fixing it. | ||||
|  | ||||
|  | ||||
| # =    2  Plot Annotations  ==================================================== | ||||
|  | ||||
| # In this section we will plot domain annotations as colored rectangles on a | ||||
| # sequence, as an example of using the R plotting system for generic, data | ||||
| # driven images. | ||||
|  | ||||
| # We need a small utility function that draws the annotation boxes on a | ||||
| # representation of sequence. It should accept the start and end coordinates, | ||||
| # the y value where it should be plotted and the color of the box, and plot a | ||||
| # rectangle using R's rect() function. | ||||
|  | ||||
| drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) { | ||||
|   # Purpose: | ||||
|   #   Add one filled rectangle with a black outline to the current plot. | ||||
|   # Parameters: | ||||
|   #   xStart, xEnd:  left and right edge of the rectangle | ||||
|   #   y:             vertical centre of the rectangle | ||||
|   #   myCol:         fill colour | ||||
|   #   DELTA:         half-height: the box spans y - DELTA to y + DELTA | ||||
|   # Value: | ||||
|   #   The return value of rect(); called for its side effect on the plot. | ||||
|   yBottom <- y - DELTA | ||||
|   yTop    <- y + DELTA | ||||
|   rect(xleft = xStart, ybottom = yBottom, xright = xEnd, ytop = yTop, | ||||
|        col = myCol, border = "black") | ||||
| } | ||||
|  | ||||
| # test this: draw a horizontal line, then a box centred on it | ||||
| plot(c(-1.5, 1.5), c(0, 0), type = "l") | ||||
| drawBox(-1, 1, 0.0, "peachpuff") | ||||
|  | ||||
| # Next, we define a function to plot annotations for one protein: the name of | ||||
| # the protein, a horizontal grey line for its length, and all of its features. | ||||
|  | ||||
| plotProtein <- function(DB, name, y) { | ||||
|   # Purpose: | ||||
|   #   Add the annotations of one protein to the current plot: its name, a | ||||
|   #   grey line as long as its sequence, and one box per annotated feature. | ||||
|   # Parameters: | ||||
|   #   DB:    protein database | ||||
|   #   name:  the name of the protein in the database. | ||||
|   #   y:     height where to draw the plot | ||||
|   # Value: | ||||
|   #   NULL (invisible). Called for its side effect of drawing on the plot. | ||||
|   # | ||||
|   # Define colors: we create a vector of color values, one for | ||||
|   # each feature, and we give it names of the feature ID. Then we | ||||
|   # can easily get the color value from the feature ID. | ||||
|   # A: make a vector of color values. The syntax may appear unusual - | ||||
|   #    colorRampPalette() returns a function, and we simply append | ||||
|   #    the parameter (number-of-features) without assigning the function | ||||
|   #    to its own variable name. | ||||
|   ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00", "#62C923", | ||||
|                                "#0A9A9B", "#1958C3", "#8000D3", "#D0007F"), | ||||
|                              space = "Lab", | ||||
|                              interpolate = "linear")(nrow(DB$feature)) | ||||
|   # B: Features may overlap, so we make the colors transparent by setting | ||||
|   #    their "alpha channel" to 1/3  (hex: 55) | ||||
|   ftrCol <- paste0(ftrCol, "55") | ||||
|   # C: we assign names | ||||
|   names(ftrCol) <- DB$feature$ID | ||||
|   # E.g. color for the third feature: | ||||
|   #   ftrCol[ as.character(DB$feature$ID[3]) ] | ||||
|  | ||||
|   # find the row-index of the protein in the protein table of DB | ||||
|   iProtein <- which(DB$protein$name == name) | ||||
|   if (length(iProtein) != 1) { | ||||
|     # Fail early with a clear message, before anything is drawn. | ||||
|     stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.", | ||||
|                  name, length(iProtein)), | ||||
|          call. = FALSE) | ||||
|   } | ||||
|  | ||||
|   # write the name of the protein | ||||
|   text(-30, y, adj = 1, labels = name, cex = 0.75) | ||||
|  | ||||
|   # draw a line from 0 to nchar(sequence-of-the-protein) | ||||
|   lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y), | ||||
|         lwd = 3, col = "#999999") | ||||
|  | ||||
|   # get the rows of feature annotations for the protein | ||||
|   iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein]) | ||||
|  | ||||
|   # draw a colored box for each feature | ||||
|   for (i in iFtr) { | ||||
|     # Subscript with a character key so the color is always found by its | ||||
|     # name (the feature ID), never by position - even if IDs are numbers. | ||||
|     drawBox(DB$annotation$start[i], | ||||
|             DB$annotation$end[i], | ||||
|             y, | ||||
|             ftrCol[ as.character(DB$annotation$featureID[i]) ]) | ||||
|   } | ||||
| } | ||||
|  | ||||
| # Plot each annotated protein: | ||||
| # Get the rows of all unique annotated Mbp1 proteins in myDB | ||||
|  | ||||
| iRows <- grep("^MBP1_", myDB$protein$name) | ||||
|  | ||||
| # size the plot-frame to accommodate all proteins, with 10% head-room | ||||
| yMax <- 1.1 * length(iRows) | ||||
| xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))  # longest sequence | ||||
|  | ||||
| # save the current plot parameters, then decrease the margins | ||||
| oPar <- par(mar = c(4.2, 0.1, 3, 0.1)) | ||||
|  | ||||
| # plot an empty frame; the x-axis is added separately | ||||
| plot(1, 1, | ||||
|      type = "n", | ||||
|      axes = FALSE, | ||||
|      bty = "n", | ||||
|      xlim = c(-200, xMax + 100), | ||||
|      ylim = c(0, yMax), | ||||
|      main = "Mbp1 orthologue domain annotations", | ||||
|      xlab = "sequence position", | ||||
|      ylab = "", | ||||
|      cex.axis = 0.8) | ||||
| axis(side = 1, at = seq(from = 0, to = xMax, by = 100)) | ||||
|  | ||||
| # legend: one semi-transparent color per row of the feature table | ||||
| myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00", "#62C923", | ||||
|                                    "#0A9A9B", "#1958C3", "#8000D3", "#D0007F"), | ||||
|                                  space = "Lab", | ||||
|                                  interpolate = "linear")(nrow(myDB$feature)), | ||||
|                 "55") | ||||
| legend(x = xMax - 150, y = 7, | ||||
|        legend = myDB$feature$name, | ||||
|        fill = myCol, | ||||
|        cex = 0.7, | ||||
|        bty = "n") | ||||
|  | ||||
| # Finally, call plotProtein() once per protein; protein i is drawn at height i | ||||
| for (i in seq_along(iRows)) { | ||||
|   plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i) | ||||
| } | ||||
| par(oPar)  # reset the plot parameters | ||||
|  | ||||
|  | ||||
| # The plot shows what is variable and what is constant about the annotations in | ||||
| # a group of related proteins. Your MBP1_MYSPE annotations should appear at the | ||||
| # top. | ||||
|  | ||||
| # Task: | ||||
| #    Put a copy of the plot into your journal and interpret it with respect | ||||
| #    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot. | ||||
|  | ||||
| # Task: | ||||
| #    It would be better to align the motif borders, at least approximately (not | ||||
| #    all proteins have all motifs). How would you go about doing that? | ||||
|  | ||||
| # =    3  SHARING DATA  ======================================================== | ||||
|  | ||||
| # It's particularly interesting to compare such annotations across many | ||||
| # homologous proteins. I have created a page on the Student Wiki (the "Public" | ||||
| # page linked in section 3.1) that you can edit, and then download the data | ||||
| # from the entire class directly to your RStudio project. | ||||
| # | ||||
|  | ||||
| # I have provided a function that extracts all information that refers to a | ||||
| # single protein from the database, and prints it out as well-formatted JSON, | ||||
| # suitable to be pasted into our shareable Wiki-page. There is a fair amount of | ||||
| # bookkeeping involved, but the code is not otherwise very enlightening so I | ||||
| # will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you | ||||
| # would want to have a look. | ||||
|  | ||||
|  | ||||
| # ==   3.1  Post MBP1_MYSPE as JSON data  ====================================== | ||||
|  | ||||
| # Task: | ||||
| # ===== | ||||
| # 1: Run the following code: | ||||
|  | ||||
| # Print the Wiki markup for this protein, one element per line: the JSON | ||||
| # sits inside a <pre class="protein-data"> element. | ||||
| cat(sep = "\n", | ||||
|     "{{Vspace}}", | ||||
|     "<!-- ==== BEGIN  PROTEIN ==== -->", | ||||
|     "<pre class=\"protein-data\">", | ||||
|     dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))), | ||||
|     "</pre>", | ||||
|     "<!-- ===== END PROTEIN ====== -->", | ||||
|     "") | ||||
|  | ||||
| # 2: Copy the entire output from the console. | ||||
| # 3: Navigate to | ||||
| #      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public | ||||
| #    ... edit the page, and paste your output at the top. | ||||
| # 4: Save your edits. | ||||
|  | ||||
|  | ||||
|  | ||||
| # ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================ | ||||
|  | ||||
| # Once we have collected a number of protein annotations, we can access the | ||||
| # Wiki-page and import the data into our database. The Wiki page is an HTML | ||||
| # document with lots of MediaWiki-specific stuff - but the content we are | ||||
| # interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These | ||||
| # work like normal HTML <pre> tags, but we have defined a special class for them | ||||
| # to make it easy to parse out the contents we want. The rvest:: package in | ||||
| # combination with xml2:: provides us with all the tools we need for such | ||||
| # "Webscraping" of data.... | ||||
|  | ||||
| # rvest:: and xml2:: retrieve and parse the Wiki page ... | ||||
| if (! requireNamespace("rvest", quietly = TRUE)) { | ||||
|   install.packages("rvest") | ||||
| } | ||||
|  | ||||
| if (! requireNamespace("xml2", quietly = TRUE)) { | ||||
|   install.packages("xml2") | ||||
| } | ||||
|  | ||||
| # ... and jsonlite:: converts the JSON text we find there into R objects. | ||||
| if (! requireNamespace("jsonlite", quietly = TRUE)) { | ||||
|   install.packages("jsonlite") | ||||
| } | ||||
|  | ||||
| # Here's the process: | ||||
| # The URL is an "open" page on the student Wiki. Users that are not logged in | ||||
| # can view the contents, but you can only edit if you are logged in. | ||||
| myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public" | ||||
|  | ||||
| # First thing is to retrieve the HTML from the url... | ||||
| x <- xml2::read_html(myURL) | ||||
|  | ||||
| # This retrieves the page source, but that still needs to be parsed into its | ||||
| # logical elements. HTML is a subset of XML and such documents are structured as | ||||
| # trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes() | ||||
| # parses out the document structure and then uses a so-called "xpath" expression | ||||
| # to select nodes we are interested in. Now, xpath is one of those specialized | ||||
| # languages of which there are a few more to learn than one would care for. You | ||||
| # MUST know how to format sprintf() expressions, and you SHOULD be competent | ||||
| # with regular expressions. But if you want to be really competent in your work, | ||||
| # basic HTML and CSS is required ... and enough knowledge about xpath to be able | ||||
| # to search on Stackoverflow for what you need for parsing data out of Web | ||||
| # documents... | ||||
|  | ||||
| # The expression we use below is: | ||||
| #   - get any node anywhere in the tree ("//*") ... | ||||
| #   - that has a particular attribute("[@ ... ]"). | ||||
| #   - The attribute we want is that the class of the node is "protein-data"; | ||||
| #      that is the class we have defined for our <pre> tags. | ||||
| # As a result of this selection, we get a list of pointers to the document tree. | ||||
| y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]') | ||||
|  | ||||
| # Next we fetch the actual payload - the text - from the tree: | ||||
| # rvest::html_text() gets the text from the list of pointers. The result is a | ||||
| # normal character vector, with one string for each selected node. | ||||
| z <- rvest::html_text(y) | ||||
|  | ||||
| # Finally we can iterate over the list, and add all proteins we don't already | ||||
| # have to our database. There may well be items that are rejected because they | ||||
| # are already present in the database - for example, unless somebody has | ||||
| # annotated new features, all of the features are already there. Don't worry - | ||||
| # that is intended; we don't want duplicate entries. | ||||
|  | ||||
| for (thisJSON in z) { | ||||
|   # The Wiki entries are pasted in by hand, so one malformed entry must not | ||||
|   # abort the whole import. We parse inside tryCatch(): if that fails we warn | ||||
|   # and leave thisName as NULL, so the entry is skipped below. | ||||
|   thisName <- tryCatch({ | ||||
|     thisData <- jsonlite::fromJSON(thisJSON) | ||||
|     thisData$protein$name | ||||
|   }, error = function(e) { | ||||
|     warning("Skipping a Wiki entry that could not be parsed: ", | ||||
|             conditionMessage(e), call. = FALSE) | ||||
|     NULL | ||||
|   }) | ||||
|  | ||||
|   # Only add entries that describe exactly one protein which we don't | ||||
|   # already have in the database. | ||||
|   if (length(thisName) == 1 && ! thisName %in% myDB$protein$name) { | ||||
|     myDB <- dbAddProtein(myDB, thisData$protein) | ||||
|     myDB <- dbAddTaxonomy(myDB, thisData$taxonomy) | ||||
|     myDB <- dbAddFeature(myDB, thisData$feature) | ||||
|     myDB <- dbAddAnnotation(myDB, thisData$annotation) | ||||
|   } | ||||
| } | ||||
|  | ||||
| # Finally, we can repeat our domain plot with the results - which now include | ||||
| # the shared proteins: | ||||
|  | ||||
| iRows <- grep("^MBP1_", myDB$protein$name) | ||||
| yMax <- 1.1 * length(iRows) | ||||
| xMax <- 1.1 * max(nchar(myDB$protein$sequence[iRows]))  # longest sequence | ||||
|  | ||||
| # save the current plot parameters, decrease the margins, and plot an | ||||
| # empty frame; the x-axis is added separately | ||||
| oPar <- par(mar = c(4.2, 0.1, 3, 0.1)) | ||||
| plot(1, 1, | ||||
|      type = "n", | ||||
|      axes = FALSE, | ||||
|      bty = "n", | ||||
|      xlim = c(-200, xMax + 100), | ||||
|      ylim = c(0, yMax), | ||||
|      main = "Mbp1 orthologue domain annotations", | ||||
|      xlab = "sequence position", | ||||
|      ylab = "", | ||||
|      cex.axis = 0.8) | ||||
| axis(side = 1, at = seq(from = 0, to = xMax, by = 100)) | ||||
|  | ||||
| # legend: one semi-transparent color per row of the feature table | ||||
| myCol <- paste0(colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00", "#62C923", | ||||
|                                    "#0A9A9B", "#1958C3", "#8000D3", "#D0007F"), | ||||
|                                  space = "Lab", | ||||
|                                  interpolate = "linear")(nrow(myDB$feature)), | ||||
|                 "55") | ||||
| legend(x = xMax - 150, y = 7, | ||||
|        legend = myDB$feature$name, | ||||
|        fill = myCol, | ||||
|        cex = 0.7, | ||||
|        bty = "n") | ||||
|  | ||||
| # call plotProtein() once per protein; protein i is drawn at height i | ||||
| for (i in seq_along(iRows)) { | ||||
|   plotProtein(DB = myDB, name = myDB$protein$name[iRows[i]], y = i) | ||||
| } | ||||
| par(oPar)  # reset the plot parameters | ||||
|  | ||||
| # ... the more proteins we can compare, the more we learn about the | ||||
| # architectural principles of this family's domains. | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,169 +1,169 @@ | ||||
| # tocID <- "BIN-FUNC-Semantic_similarity.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-FUNC_Semantic_similarity unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-11  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0    New code. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                                Line | ||||
| #TOC> -------------------------------------------------------------------- | ||||
| #TOC>   1        Preparations: Packages, AnnotationDB, Setup            43 | ||||
| #TOC>   2        Fetch GO Annotations                                  100 | ||||
| #TOC>   3        Semantic Similarities                                 109 | ||||
| #TOC>   4        GO Term Enrichment in Gene Sets                       127 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Preparations: Packages, AnnotationDB, Setup  ========================= | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
|  | ||||
| # GOSim is an R-package in the Bioconductor project. | ||||
| if (! requireNamespace("GOSim", quietly = TRUE)) { | ||||
|   BiocManager::install("GOSim") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = GOSim)       # basic information | ||||
| #  browseVignettes("GOSim")    # available vignettes | ||||
| #  data(package = "GOSim")     # available datasets | ||||
|  | ||||
| # GOSim makes extensive assumptions about loaded packages, and many base | ||||
| # methods are masked. We will thus use library(GOSim) to load it | ||||
| # in its entirety and with all packages it depends on. We will still use | ||||
| # the <package>::<function>() syntax in the code below, but this now serves | ||||
| # more of a didactic purpose, rather than actual syntax requirements. | ||||
|  | ||||
| library(GOSim) | ||||
|  | ||||
| # GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast | ||||
| # annotations instead... | ||||
| if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) { | ||||
|   BiocManager::install("org.Sc.sgd.db") | ||||
| } | ||||
|  | ||||
| # Bioconductor annotation packages won't work stably unless we actually load | ||||
| # them: | ||||
| library(org.Sc.sgd.db) | ||||
|  | ||||
| # org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such | ||||
| # databases exist for all model organisms. It's a kind of a fancy data frame | ||||
| # from which we can get annotations by rows (genes) with the keys() function ... | ||||
| AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510] | ||||
|  | ||||
| # ... and the types of available annotations with the columns() function | ||||
| AnnotationDbi::columns(org.Sc.sgd.db) | ||||
|  | ||||
| # Note that one of the columns is "GO" ... and we load that into the | ||||
| # datastructures used by GOSim: | ||||
|  | ||||
| # Choose GOterms to use | ||||
| GOSim::setEvidenceLevel(evidences = "all", | ||||
|                         organism = org.Sc.sgdORGANISM, | ||||
|                         gomap = org.Sc.sgdGO) | ||||
|  | ||||
| # Use Biological Process ontology | ||||
| GOSim::setOntology("BP", loadIC = FALSE) | ||||
|  | ||||
| # confirm that we loaded the correct ontology | ||||
| head(get("gomap", envir = GOSimEnv)) | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    2  Fetch GO Annotations  ================================================ | ||||
|  | ||||
|  | ||||
| # All keys being used here are yeast systematic names. | ||||
|  | ||||
| # Get one set of annotations | ||||
| GOSim::getGOInfo(c("YDL056W"))  # Mbp1 | ||||
|  | ||||
|  | ||||
| # =    3  Semantic Similarities  =============================================== | ||||
|  | ||||
|  | ||||
| # Get semantic similarities between genes | ||||
| ?getGeneSim | ||||
|  | ||||
| # There are _many_ different metrics of term similarity implemented | ||||
| # in this package. | ||||
|  | ||||
|                                                          # Mbp1 and... | ||||
| GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex | ||||
| GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators | ||||
| GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator | ||||
| GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist | ||||
| GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist | ||||
| GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis | ||||
|  | ||||
|  | ||||
| # =    4  GO Term Enrichment in Gene Sets  ===================================== | ||||
|  | ||||
|  | ||||
| # Calculating GO term enrichment in gene sets is done with the Bioconductor | ||||
| # topGO package. | ||||
| if (! requireNamespace("topGO", quietly = TRUE)) { | ||||
|   BiocManager::install("topGO") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = topGO)       # basic information | ||||
| #  browseVignettes("topGO")    # available vignettes | ||||
| #  data(package = "topGO")     # available datasets | ||||
|  | ||||
| # Once again - assumptions are made by GOSim that require us to load the | ||||
| # topGO package wholesale: | ||||
| library(topGO) | ||||
|  | ||||
| # Let's define a gene set: GOterm enrichment for G1/S switch activators: | ||||
| mySet <- c("YFR028C", # Cdc14 | ||||
|            "YDL056W", # Mbp1 | ||||
|            "YLR182W", # Swi6 | ||||
|            "YER111C", # Swi4 | ||||
|            "YOR083W", # Whi5 | ||||
|            "YBR160W", # Cdc28 | ||||
|            "YMR199W", # Cln1 | ||||
|            "YPL256C", # Cln2 | ||||
|            "YAL040C") # Cln3 | ||||
|  | ||||
| allGenes <- AnnotationDbi::keys(org.Sc.sgd.db) | ||||
| allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which | ||||
|                                             # we define enrichment | ||||
|  | ||||
| myEnr <- GOenrichment(mySet, allGenes) | ||||
|  | ||||
| sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ... | ||||
|  | ||||
| #Most significantly enriched is GO:0071931. What is this? | ||||
| annotate::getGOTerm("GO:0071931")  # ... makes sense. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-FUNC-Semantic_similarity.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-FUNC_Semantic_similarity unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-11  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0    New code. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                                Line | ||||
| #TOC> -------------------------------------------------------------------- | ||||
| #TOC>   1        Preparations: Packages, AnnotationDB, Setup            43 | ||||
| #TOC>   2        Fetch GO Annotations                                  100 | ||||
| #TOC>   3        Semantic Similarities                                 109 | ||||
| #TOC>   4        GO Term Enrichment in Gene Sets                       127 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Preparations: Packages, AnnotationDB, Setup  ========================= | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
|  | ||||
| # GOSim is an R-package in the Bioconductor project. | ||||
| if (! requireNamespace("GOSim", quietly = TRUE)) { | ||||
|   BiocManager::install("GOSim") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = GOSim)       # basic information | ||||
| #  browseVignettes("GOSim")    # available vignettes | ||||
| #  data(package = "GOSim")     # available datasets | ||||
|  | ||||
| # GOSim makes extensive assumptions about loaded packages, and many base | ||||
| # methods are masked. We will thus use library(GOSim) to load it | ||||
| # in its entirety and with all packages it depends on. We will still use | ||||
| # the <package>::<function>() syntax in the code below, but this now serves | ||||
| # more of a didactic purpose, rather than actual syntax requirements. | ||||
|  | ||||
| library(GOSim) | ||||
|  | ||||
| # GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast | ||||
| # annotations instead... | ||||
| if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) { | ||||
|   BiocManager::install("org.Sc.sgd.db") | ||||
| } | ||||
|  | ||||
| # Bioconductor annotation packages won't work stably unless we actually load | ||||
| # them: | ||||
| library(org.Sc.sgd.db) | ||||
|  | ||||
| # org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such | ||||
| # databases exist for all model organisms. It's a kind of a fancy data frame | ||||
| # from which we can get annotations by rows (genes) with the keys() function ... | ||||
| AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510] | ||||
|  | ||||
| # ... and the types of available annotations with the columns() function | ||||
| AnnotationDbi::columns(org.Sc.sgd.db) | ||||
|  | ||||
| # Note that one of the columns is "GO" ... and we load that into the | ||||
| # datastructures used by GOSim: | ||||
|  | ||||
| # Choose GOterms to use | ||||
| GOSim::setEvidenceLevel(evidences = "all", | ||||
|                         organism = org.Sc.sgdORGANISM, | ||||
|                         gomap = org.Sc.sgdGO) | ||||
|  | ||||
| # Use Biological Process ontology | ||||
| GOSim::setOntology("BP", loadIC = FALSE) | ||||
|  | ||||
| # confirm that we loaded the correct ontology | ||||
| head(get("gomap", envir = GOSimEnv)) | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    2  Fetch GO Annotations  ================================================ | ||||
|  | ||||
|  | ||||
| # All keys being used here are yeast systematic names. | ||||
|  | ||||
| # Get one set of annotations | ||||
| GOSim::getGOInfo(c("YDL056W"))  # Mbp1 | ||||
|  | ||||
|  | ||||
| # =    3  Semantic Similarities  =============================================== | ||||
|  | ||||
|  | ||||
| # Get semantic similarities between genes | ||||
| ?getGeneSim | ||||
|  | ||||
| # There are _many_ different metrics of term similarity implemented | ||||
| # in this package. | ||||
|  | ||||
|                                                          # Mbp1 and... | ||||
| GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex | ||||
| GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators | ||||
| GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator | ||||
| GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist | ||||
| GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist | ||||
| GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis | ||||
|  | ||||
|  | ||||
| # =    4  GO Term Enrichment in Gene Sets  ===================================== | ||||
|  | ||||
|  | ||||
| # Calculating GO term enrichment in gene sets is done with the Bioconductor | ||||
| # topGO package. | ||||
| if (! requireNamespace("topGO", quietly = TRUE)) { | ||||
|   BiocManager::install("topGO") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = topGO)       # basic information | ||||
| #  browseVignettes("topGO")    # available vignettes | ||||
| #  data(package = "topGO")     # available datasets | ||||
|  | ||||
| # Once again - assumptions are made by GOSim that require us to load the | ||||
| # topGO package wholesale: | ||||
| library(topGO) | ||||
|  | ||||
| # Let's define a gene set: GOterm enrichment for G1/S switch activators: | ||||
| mySet <- c("YFR028C", # Cdc14 | ||||
|            "YDL056W", # Mbp1 | ||||
|            "YLR182W", # Swi6 | ||||
|            "YER111C", # Swi4 | ||||
|            "YOR083W", # Whi5 | ||||
|            "YBR160W", # Cdc28 | ||||
|            "YMR199W", # Cln1 | ||||
|            "YPL256C", # Cln2 | ||||
|            "YAL040C") # Cln3 | ||||
|  | ||||
| allGenes <- AnnotationDbi::keys(org.Sc.sgd.db) | ||||
| allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which | ||||
|                                             # we define enrichment | ||||
|  | ||||
| myEnr <- GOenrichment(mySet, allGenes) | ||||
|  | ||||
| sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ... | ||||
|  | ||||
| #Most significantly enriched is GO:0071931. What is this? | ||||
| annotate::getGOTerm("GO:0071931")  # ... makes sense. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										702
									
								
								BIN-MYSPE.R
									
									
									
									
									
								
							
							
						
						
									
										702
									
								
								BIN-MYSPE.R
									
									
									
									
									
								
							| @@ -1,351 +1,351 @@ | ||||
| # tocID <- "BIN-MYSPE.R" | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-MYSPE unit | ||||
| # | ||||
| # | ||||
| # Version: 1.4 | ||||
| # | ||||
| # Date:    2017-09 - 2021-10 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # V 1.4    Add troubleshooting hints via errText[[...]] | ||||
| # V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about | ||||
| # V 1.2    Reorganized proportional plot section into a "further reading" | ||||
| #          section, added nested-box, and sankey plot visualization of | ||||
| #          proportions. Introduced plotly. | ||||
| # V 1.1    2020 Workflow changes | ||||
| # V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory | ||||
| # V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist | ||||
| # V 0.1    First code copied from BCH441_A03_makeMYSPElist.R | ||||
| # | ||||
| # TODO:    Sample solution for sankey plot function. | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # DO NOT SIMPLY  source()  THESE FILES! | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                             Line | ||||
| #TOC> ----------------------------------------------------------------- | ||||
| #TOC>   1        PREPARATIONS                                        52 | ||||
| #TOC>   2        SUITABLE MYSPE SPECIES                              65 | ||||
| #TOC>   3        ADOPT "MYSPE"                                       89 | ||||
| #TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128 | ||||
| #TOC>   4.1        Percentages                                      146 | ||||
| #TOC>   4.2        Visualizing proportions: Pie chart               165 | ||||
| #TOC>   4.3        Visualizing proportions: Nested squares          243 | ||||
| #TOC>   4.4        Visualizing proportions: Sankey diagrams         280 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  PREPARATIONS  ======================================================== | ||||
| # | ||||
|  | ||||
| # Execute the two conditionals below: | ||||
| if (! file.exists("./myScripts/.myProfile.R")) { | ||||
|   stop(errText[["noProfileFile"]])     # message defined in .Rprofile | ||||
| } | ||||
|  | ||||
| if (! exists("myStudentNumber")) { | ||||
|   stop(errText[["noStudentNumber"]])   # message defined in .Rprofile | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    2  SUITABLE MYSPE SPECIES  ============================================== | ||||
|  | ||||
|  | ||||
| # In this unit we will select one species from a list of genome sequenced fungi | ||||
| # and write it into your personalized profile file. This species will be called | ||||
| # "MYSPE" (My Species) for other learning units and exercises. | ||||
|  | ||||
| # A detailed description of the process of compiling the list of genome | ||||
| # sequenced fungi with protein annotations and Mbp1 homologues is in the file | ||||
| # ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi | ||||
| # was retrieved from https://fungi.ensembl.org; a search for homologues to | ||||
| # yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged. | ||||
| # A representative organism at each genus-level was chosen from those hits | ||||
| # that actually have a homologue. Finally, a mapping table was constructed to | ||||
| # asymmetrically retrieve unique species: a student number will retrieve | ||||
| # a species, but (public) knowledge of the species cannot reconstruct the | ||||
| # student number. | ||||
|  | ||||
| # Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow | ||||
| #       of selecting and combining data from various data resources. Studying | ||||
| #       it will give you a better sense of how such workflows can be | ||||
| #       implemented in practice. | ||||
|  | ||||
|  | ||||
| # =    3  ADOPT "MYSPE"  ======================================================= | ||||
|  | ||||
| # Execute: | ||||
| ( MYSPE <- getMYSPE(myStudentNumber) ) | ||||
|  | ||||
| # If this produced an error, this session has not been properly set up. You | ||||
| # may not yet have run  init()  and edited  .myProfile.R , or that file is not | ||||
| # in your  myScripts/  folder. Fix this, and execute: | ||||
| # | ||||
| #    source(".Rprofile") . | ||||
|  | ||||
| # If this produced NA, your Student Number may not be correct, or you are not in | ||||
| # my class-list. Contact me. Otherwise, this should have printed a species name, | ||||
| # and the taxonomy ID of its genome-sequenced strain. This is your unique | ||||
| # species for this course. Note it in your journal ... | ||||
|  | ||||
| biCode(MYSPE) # and also note its "BiCode" ... | ||||
| ( myTaxID <- names(MYSPE) )  # and its taxID | ||||
|  | ||||
|  | ||||
| # Task: | ||||
| # ===== | ||||
| #   Note down the species name and its five letter BiCode on your Student | ||||
| #   Wiki user page. Use this species whenever this or future assignments refer | ||||
| #   to MYSPE. Whenever you start a session, it will automatically be loaded | ||||
| #   from  myScripts/.myProfile.R  and is available as  MYSPE . | ||||
|  | ||||
| # Here is some more information about MYSPE, taken from the table of genome- | ||||
| # sequenced fungi that is in your ./data folder. | ||||
| fungiDat <- read.csv("data/Species.csv") | ||||
| iMs <- which(fungiDat$Taxon.ID == myTaxID) | ||||
|  | ||||
| ( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order | ||||
| ( myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus | ||||
| ( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain | ||||
|  | ||||
| # That's all. | ||||
|  | ||||
|  | ||||
| # =    4  FURTHER READING: PLOTTING PROPORTIONS  =============================== | ||||
|  | ||||
| # The material below is an exploration of data-preparation and plotting | ||||
| # techniques; you can treat this as additional practice and further reading and | ||||
| # I expect that some of the code and plotting examples may be useful in a | ||||
| # different context. | ||||
|  | ||||
| # A frequent task is to visualize the proportion of elements with given | ||||
| # categories in a sample. For example, we might ask: what proportion of all | ||||
| # genome-sequenced fungi belongs to the order of MYSPE? Let's first collect | ||||
| # the numbers. | ||||
|  | ||||
| ( nFungi <- nrow(fungiDat) )                            # sequenced fungi | ||||
| ( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE | ||||
| ( nGenus <- sum(grepl(myGn, fungiDat$Name)) )           # same genus as MYSPE | ||||
| ( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) )        # same species as MYSPE | ||||
|  | ||||
|  | ||||
| # ==   4.1  Percentages  ======================================================= | ||||
|  | ||||
| # The zeroth-order approach to visualization is simply to print percentages: | ||||
|  | ||||
| cat(sprintf("\n%s comprise %5.2f%% of fungi.", | ||||
|         myOr, | ||||
|         (nOrder * 100) / nFungi)) | ||||
|  | ||||
| # ... or, adding the actual numbers: | ||||
|  | ||||
| cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).", | ||||
|             myOr, | ||||
|             (nOrder * 100) / nFungi, | ||||
|             nOrder, | ||||
|             nFungi)) | ||||
|  | ||||
| # But that's hard to visualize for most of us, and anyway, we don't know how | ||||
| # that relates to other orders. | ||||
|  | ||||
| # ==   4.2  Visualizing proportions: Pie chart  ================================ | ||||
|  | ||||
| # Often, we will use a pie chart instead. Pie charts are rather informal types | ||||
| # of plots, not well suited for analysis. But easy to do: | ||||
|  | ||||
| # Define four colors to identify the four categories | ||||
| pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0") | ||||
|  | ||||
| oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0 | ||||
|                                            # and remember the | ||||
|                                            # previous setting | ||||
|  | ||||
| # The categories are nested (species within genus within order within all | ||||
| # fungi), so each wedge is a count minus the count of the one category it | ||||
| # directly contains; the four wedges then add up to nFungi. | ||||
| pie(c(nSpecies,                            # MYSPE itself | ||||
|       nGenus - nSpecies,                   # rest of the genus | ||||
|       nOrder - nGenus,                     # rest of the order | ||||
|       nFungi - nOrder),                    # all other fungi | ||||
|       labels = "", | ||||
|       radius = 0.9, | ||||
|       main = "MYSPE in genome-sequenced fungi", | ||||
|       lty = 0,                             # turn borders for wedges off | ||||
|       col = pCol, | ||||
|       clockwise = TRUE, | ||||
|       init.angle = 90) | ||||
|  | ||||
| title(main=MYSPE, line=0, cex.main=0.7)    # add a title to the plot | ||||
|  | ||||
| # Legend: one filled square per category, coloured from pCol in the same | ||||
| # order as the pie wedges (Species, Genus, Order, Fungi). | ||||
| legend(x = 0.95, y = 0.8,    # place the legend here | ||||
|        legend = c("Species", "Genus", "Order", "Fungi"), | ||||
|        y.intersp = 2,                      # line spacing for labels | ||||
|        cex = 0.8,                          # character size for labels | ||||
|        bty = "n",                          # "no" box around the legend | ||||
|        pt.cex = 2,                         # size of colour boxes | ||||
|        pch = 15,                           # a filled square | ||||
|        col = pCol) | ||||
|  | ||||
| par(oPar)                                  # reset graphics state | ||||
|  | ||||
| # Unless MYSPE is one of the frequently sequenced species, there will only be a | ||||
| # very thin wedge visible. Pie charts are not well suited to visualize small | ||||
| # proportions. | ||||
|  | ||||
| # It is a little more useful if we have non-nested proportions - like the | ||||
| # number of species in the same order overall: | ||||
|  | ||||
| myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE) | ||||
| head(myTbl) | ||||
|  | ||||
| # pie() does a reasonable job out of the box to interpret table() data: | ||||
| pie(myTbl) | ||||
|  | ||||
| # ... we can improve this quickly with a bit of tweaking: | ||||
|  | ||||
| N <- length(myTbl) | ||||
| sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere | ||||
|  | ||||
| myCol <- rep(pCol[4], N)       # N elements of pCol[4] (the background colour) | ||||
| myCol[sel] <- pCol[1]          # highlight the MYSPE order with pCol[1] | ||||
|  | ||||
| myLbl <- rep("", N)            # N labels of "" | ||||
| myLbl[sel] <- myOr             # replace this one label with the MYSPE order | ||||
|  | ||||
|  | ||||
| oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0 | ||||
|  | ||||
| pie(myTbl, | ||||
|     labels = myLbl, | ||||
|     radius = 0.9, | ||||
|     main = "MYSPE order", | ||||
|     border = "#DDDDDD", | ||||
|     col = myCol, | ||||
|     clockwise = TRUE, | ||||
|     init.angle = 90) | ||||
|  | ||||
| par(oPar)                                  # reset graphics state | ||||
|  | ||||
| # But the overall problem remains. | ||||
|  | ||||
|  | ||||
| # ==   4.3  Visualizing proportions: Nested squares  =========================== | ||||
|  | ||||
| # A simple alternative is to draw such proportions as nested squares: | ||||
|  | ||||
| x <- sqrt(nFungi) | ||||
|  | ||||
| # set margins to ~ 0 and type to square | ||||
| oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s") | ||||
|  | ||||
| # empty, square plot | ||||
| plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x), | ||||
|      type="n", axes=FALSE, xlab="", ylab="") | ||||
|  | ||||
| # basic square for all genomes | ||||
| rect(0, 0, x,              x,              col = pCol[4]) | ||||
|  | ||||
| # grid | ||||
| u <- 0:floor(x) | ||||
| N <- length(u) | ||||
| segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18") | ||||
| segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18") | ||||
| # each square on this grid is one genome | ||||
|  | ||||
| # colored squares | ||||
| rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3]) | ||||
| rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2]) | ||||
| rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1]) | ||||
|  | ||||
| # labels | ||||
| text(x/2, x/2,      "Fungi") | ||||
| text(x * 0.08, x * 0.11, myOr,   pos = 4, cex = 0.9) | ||||
| text(x * 0.08, x * 0.06, myGn,   pos = 4, cex = 0.8) | ||||
| text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7) | ||||
|  | ||||
| par(oPar)                                  # reset graphics state | ||||
|  | ||||
|  | ||||
| # ==   4.4  Visualizing proportions: Sankey diagrams  ========================== | ||||
|  | ||||
| # Sankey diagrams are an excellent way to visualize complicated nested | ||||
| # proportions and their changes (see here for example: | ||||
| # https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple | ||||
| # example with the MYSPE proportions, as an illustration of the plotting | ||||
| # principle. | ||||
|  | ||||
| # Install plotly only if it is not yet available. quietly = TRUE suppresses | ||||
| # the namespace-loading messages, as in the other install guards. | ||||
| if (! requireNamespace("plotly", quietly = TRUE)) { | ||||
|   install.packages("plotly") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help   = plotly)     # basic information | ||||
| #  browseVignettes("plotly")    # available vignettes | ||||
| #  data(package  = "plotly")    # available datasets | ||||
|  | ||||
| # Here, we use the plotly package that wraps a very well developed javascript | ||||
| # library with many options for interactive plots. I am producing this plot | ||||
| # hard-coded for the sample organism "Sporothrix schenckii"; you would need | ||||
| # to change the code to adapt it to your own MYSPE - or even build a function | ||||
| # for this. Do try this if you have a bit of coding experience, sankey diagrams | ||||
| # are a good way to show hierarchical data relations - and if you get this | ||||
| # working for your own organism you can be proud that you have understood | ||||
| # how preparing the data works. | ||||
|  | ||||
|  | ||||
| myNodes <- list(label = c("Fungi (1014)",              # 0 <- node ID | ||||
|                           "Ophiostomatales (6)",       # 1 | ||||
|                           "Other...",                  # 2 | ||||
|                           "Sporothrix (4)",            # 3 | ||||
|                           "Other...",                  # 4 | ||||
|                           "Sporothrix schenckii (2)",  # 5 | ||||
|                           "Other..."                   # 6 | ||||
|                           ), | ||||
|                 x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0), | ||||
|                 y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7), | ||||
|                 color = c("#f2f2f0", # | ||||
|                           "#ffd5c4", | ||||
|                           "#CCCCCC", | ||||
|                           "#ff9582", | ||||
|                           "#CCCCCC", | ||||
|                           "#ed394e", | ||||
|                           "#CCCCCC" | ||||
|                           ), | ||||
|                 pad = 15, | ||||
|                 thickness = 20, | ||||
|                 line = list(color = "black", | ||||
|                             width = 0.5)) | ||||
|  | ||||
| myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of | ||||
|                 target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0 | ||||
|                 value =  c(6, 18, 4, 2, 2, 2))  # and node 1 | ||||
| # Element i of source, target and value together describe link i; the IDs are | ||||
| # the zero-based positions of the labels in myNodes. | ||||
| # NOTE(review): node 0 is labelled "Fungi (1014)" but its two outgoing links | ||||
| # sum to 6 + 18 = 24 - presumably the "Other..." link is scaled down to keep | ||||
| # the small links visible; confirm this is intended. | ||||
|  | ||||
| # Setting up the actual plot ... | ||||
| fig  <-  plotly::plot_ly(type = "sankey", | ||||
|                          arrangement = "snap", | ||||
|                          orientation = "h", | ||||
|                          node = myNodes, | ||||
|                          link = myLinks) | ||||
|  | ||||
| # Adding and adjusting a few layout parameters | ||||
| fig <- plotly::layout(fig, | ||||
|               title = "Fungi Genomes - Classification", | ||||
|               font = list(size = 10)) | ||||
|  | ||||
| fig     # plot the diagram | ||||
|  | ||||
| # Note that the plot appears in the Viewer window, not the Plot window, and that | ||||
| # it is interactive: you can hover over nodes and links, and drag the nodes | ||||
| # around. | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-MYSPE.R" | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-MYSPE unit | ||||
| # | ||||
| # | ||||
| # Version: 1.4 | ||||
| # | ||||
| # Date:    2017-09 - 2021-10 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # V 1.4    Add troubleshooting hints via errText[[...]] | ||||
| # V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about | ||||
| # V 1.2    Reorganized proportional plot section into a "further reading" | ||||
| #          section, added nested-box, and sankey plot visualization of | ||||
| #          proportions. Introduced plotly. | ||||
| # V 1.1    2020 Workflow changes | ||||
| # V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory | ||||
| # V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist | ||||
| # V 0.1    First code copied from BCH441_A03_makeMYSPElist.R | ||||
| # | ||||
| # TODO:    Sample solution for sankey plot function. | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # DO NOT SIMPLY  source()  THESE FILES! | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                             Line | ||||
| #TOC> ----------------------------------------------------------------- | ||||
| #TOC>   1        PREPARATIONS                                        52 | ||||
| #TOC>   2        SUITABLE MYSPE SPECIES                              65 | ||||
| #TOC>   3        ADOPT "MYSPE"                                       89 | ||||
| #TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128 | ||||
| #TOC>   4.1        Percentages                                      146 | ||||
| #TOC>   4.2        Visualizing proportions: Pie chart               165 | ||||
| #TOC>   4.3        Visualizing proportions: Nested squares          243 | ||||
| #TOC>   4.4        Visualizing proportions: Sankey diagrams         280 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  PREPARATIONS  ======================================================== | ||||
| # | ||||
|  | ||||
| # Execute the two conditionals below: | ||||
| if (! file.exists("./myScripts/.myProfile.R")) { | ||||
|   stop(errText[["noProfileFile"]])     # message defined in .Rprofile | ||||
| } | ||||
|  | ||||
| if (! exists("myStudentNumber")) { | ||||
|   stop(errText[["noStudentNumber"]])   # message defined in .Rprofile | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    2  SUITABLE MYSPE SPECIES  ============================================== | ||||
|  | ||||
|  | ||||
| # In this unit we will select one species from a list of genome sequenced fungi | ||||
| # and write it into your personalized profile file. This species will be called | ||||
| # "MYSPE" (My Species) for other learning units and exercises. | ||||
|  | ||||
| # A detailed description of the process of compiling the list of genome | ||||
| # sequenced fungi with protein annotations and Mbp1 homologues is in the file | ||||
| # ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi | ||||
| # was retrieved from https://fungi.ensembl.org; a search for homologues to | ||||
| # yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged. | ||||
| # A representative organism at each genus-level was chosen from those hits | ||||
| # that actually have a homologue. Finally, a mapping table was constructed to | ||||
| # asymmetrically retrieve unique species: a student number will retrieve | ||||
| # a species, but (public) knowledge of the species cannot reconstruct the | ||||
| # student number. | ||||
|  | ||||
| # Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow | ||||
| #       of selecting and combining data from various data resources. Studying | ||||
| #       it will give you a better sense of how such workflows can be | ||||
| #       implemented in practice. | ||||
|  | ||||
|  | ||||
| # =    3  ADOPT "MYSPE"  ======================================================= | ||||
|  | ||||
| # Execute: | ||||
| ( MYSPE <- getMYSPE(myStudentNumber) ) | ||||
|  | ||||
| # If this produced an error, this session has not been properly set up. You | ||||
| # may not yet have run  init()  and edited  .myProfile.R , or that file is not | ||||
| # in your  myScripts/  folder. Fix this, and execute: | ||||
| # | ||||
| #    source(".Rprofile") . | ||||
|  | ||||
| # If this produced NA, your Student Number may not be correct, or you are not in | ||||
| # my class-list. Contact me. Otherwise, this should have printed a species name, | ||||
| # and the taxonomy ID of its genome-sequenced strain. This is your unique | ||||
| # species for this course. Note it in your journal ... | ||||
|  | ||||
| biCode(MYSPE) # and also note its "BiCode" ... | ||||
| ( myTaxID <- names(MYSPE) )  # and its taxID | ||||
|  | ||||
|  | ||||
| # Task: | ||||
| # ===== | ||||
| #   Note down the species name and its five letter BiCode on your Student | ||||
| #   Wiki user page. Use this species whenever this or future assignments refer | ||||
| #   to MYSPE. Whenever you start a session, it will automatically be loaded | ||||
| #   from  myScripts/.myProfile.R  and is available as  MYSPE . | ||||
|  | ||||
| # Here is some more information about MYSPE, taken from the table of genome- | ||||
| # sequenced fungi that is in your ./data folder. | ||||
| fungiDat <- read.csv("data/Species.csv") | ||||
| iMs <- which(fungiDat$Taxon.ID == myTaxID) | ||||
|  | ||||
| ( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order | ||||
| ( myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus | ||||
| ( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain | ||||
|  | ||||
| # That's all. | ||||
|  | ||||
|  | ||||
| # =    4  FURTHER READING: PLOTTING PROPORTIONS  =============================== | ||||
|  | ||||
| # The material below is an exploration of data-preparation and plotting | ||||
| # techniques; you can treat this as additional practice and further reading and | ||||
| # I expect that some of the code and plotting examples may be useful in a | ||||
| # different context. | ||||
|  | ||||
| # A frequent task is to visualize the proportion of elements with given | ||||
| # categories in a sample. For example, we might ask: what proportion of all | ||||
| # genome-sequenced fungi belongs to the order of MYSPE? Let's first collect the | ||||
| # numbers. | ||||
|  | ||||
| ( nFungi <- nrow(fungiDat) )                            # sequenced fungi | ||||
| ( nOrder <- sum(grepl(myOr, fungiDat$Classification)) ) # same order as MYSPE | ||||
| ( nGenus <- sum(grepl(myGn, fungiDat$Name)) )           # same genus as MYSPE | ||||
| ( nSpecies <- sum(grepl(MYSPE, fungiDat$Name)) )        # same species as MYSPE | ||||
|  | ||||
|  | ||||
| # ==   4.1  Percentages  ======================================================= | ||||
|  | ||||
| # The zeroth-order approach to visualization is simply to print percentages: | ||||
|  | ||||
| cat(sprintf("\n%s comprise %5.2f%% of fungi.", | ||||
|         myOr, | ||||
|         (nOrder * 100) / nFungi)) | ||||
|  | ||||
| # ... or, adding the actual numbers: | ||||
|  | ||||
| cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).", | ||||
|             myOr, | ||||
|             (nOrder * 100) / nFungi, | ||||
|             nOrder, | ||||
|             nFungi)) | ||||
|  | ||||
| # But that's hard to visualize for most of us, and anyway, we don't know how | ||||
| # that relates to other orders. | ||||
|  | ||||
| # ==   4.2  Visualizing proportions: Pie chart  ================================ | ||||
|  | ||||
| # Often, we will use a pie chart instead. Pie charts are rather informal types | ||||
| # of plots, not well suited for analysis. But easy to do: | ||||
|  | ||||
| # Define four colors to identify the four categories | ||||
| pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0") | ||||
|  | ||||
| oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0 | ||||
|                                            # and remember the | ||||
|                                            # previous setting | ||||
|  | ||||
| pie(c(nSpecies,                            # the categories are nested, so | ||||
|       nGenus - nSpecies,                   # each wedge holds only what the | ||||
|       nOrder - nGenus,                     # next-smaller category does not | ||||
|       nFungi - nOrder),                    # already cover: sum is nFungi | ||||
|       labels = "", | ||||
|       radius = 0.9, | ||||
|       main = "MYSPE in genome-sequenced fungi", | ||||
|       lty = 0,                             # turn borders for wedges off | ||||
|       col = pCol, | ||||
|       clockwise = TRUE, | ||||
|       init.angle = 90) | ||||
|  | ||||
| title(main=MYSPE, line=0, cex.main=0.7)    # add a title to the plot | ||||
|  | ||||
| legend(x = 0.95, y = 0.8,    # place the legend here | ||||
|        legend = c("Species", "Genus", "Order", "Fungi"), | ||||
|        y.intersp = 2,                      # line spacing for labels | ||||
|        cex = 0.8,                          # character size for labels | ||||
|        bty = "n",                          # "no" box around the legend | ||||
|        pt.cex = 2,                         # size of colour boxes | ||||
|        pch = 15,                           # a filled square | ||||
|        col = pCol) | ||||
|  | ||||
| par(oPar)                                  # reset graphics state | ||||
|  | ||||
| # Unless MYSPE is one of the frequently sequenced species, there will only be a | ||||
| # very thin wedge visible. Pie charts are not well suited to visualize small | ||||
| # proportions. | ||||
|  | ||||
| # It is a little more useful if we have non-nested proportions - like the | ||||
| # number of species in the same order overall: | ||||
|  | ||||
| myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE) | ||||
| head(myTbl) | ||||
|  | ||||
| # pie() does a reasonable job out of the box to interpret table() data: | ||||
| pie(myTbl) | ||||
|  | ||||
| # ... we can improve this quickly with a bit of tweaking: | ||||
|  | ||||
| N <- length(myTbl) | ||||
| sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere | ||||
|  | ||||
| myCol <- rep(pCol[4], N)       # N elements of pCol[4] (background color) | ||||
| myCol[sel] <- pCol[1]          # replace this one color for the MYSPE order | ||||
|  | ||||
| myLbl <- rep("", N)            # N labels of "" | ||||
| myLbl[sel] <- myOr             # replace this one label with the MYSPE order | ||||
|  | ||||
|  | ||||
| oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0 | ||||
|  | ||||
| pie(myTbl, | ||||
|     labels = myLbl, | ||||
|     radius = 0.9, | ||||
|     main = "MYSPE order", | ||||
|     border = "#DDDDDD", | ||||
|     col = myCol, | ||||
|     clockwise = TRUE, | ||||
|     init.angle = 90) | ||||
|  | ||||
| par(oPar)                                  # reset graphics state | ||||
|  | ||||
| # But the overall problem remains. | ||||
|  | ||||
|  | ||||
| # ==   4.3  Visualizing proportions: Nested squares  =========================== | ||||
|  | ||||
| # A simple alternative is to draw such proportions as nested squares: | ||||
|  | ||||
| x <- sqrt(nFungi) | ||||
|  | ||||
| # set margins to ~ 0 and type to square | ||||
| oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s") | ||||
|  | ||||
| # empty, square plot | ||||
| plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x), | ||||
|      type="n", axes=FALSE, xlab="", ylab="") | ||||
|  | ||||
| # basic square for all genomes | ||||
| rect(0, 0, x,              x,              col = pCol[4]) | ||||
|  | ||||
| # grid | ||||
| u <- 0:floor(x) | ||||
| N <- length(u) | ||||
| segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18") | ||||
| segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18") | ||||
| # each square on this grid is one genome | ||||
|  | ||||
| # colored squares | ||||
| rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3]) | ||||
| rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2]) | ||||
| rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1]) | ||||
|  | ||||
| # labels | ||||
| text(x/2, x/2,      "Fungi") | ||||
| text(x * 0.08, x * 0.11, myOr,   pos = 4, cex = 0.9) | ||||
| text(x * 0.08, x * 0.06, myGn,   pos = 4, cex = 0.8) | ||||
| text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7) | ||||
|  | ||||
| par(oPar)                                  # reset graphics state | ||||
|  | ||||
|  | ||||
| # ==   4.4  Visualizing proportions: Sankey diagrams  ========================== | ||||
|  | ||||
| # Sankey diagrams are an excellent way to visualize complicated nested | ||||
| # proportions and their changes (see here for example: | ||||
| # https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple | ||||
| # example with the MYSPE proportions, as an illustration of the plotting | ||||
| # principle. | ||||
|  | ||||
| # Install plotly only if it is not yet available; quietly = TRUE suppresses | ||||
| # the load-failure message, we only need the TRUE/FALSE result here. | ||||
| if (! requireNamespace("plotly", quietly = TRUE)) { | ||||
|   install.packages("plotly") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help   = plotly)     # basic information | ||||
| #  browseVignettes("plotly")    # available vignettes | ||||
| #  data(package  = "plotly")    # available datasets | ||||
|  | ||||
| # Here, we use the plotly package that wraps a very well developed javascript | ||||
| # library with many options for interactive plots. I am producing this plot | ||||
| # hard-coded for the sample organism "Sporothrix schenckii"; you would need | ||||
| # to change the code to adapt it to your own MYSPE - or even build a function | ||||
| # for this. Do try this if you have a bit of coding experience, sankey diagrams | ||||
| # are a good way to show hierarchical data relations - and if you get this | ||||
| # working for your own organism you can be proud that you have understood | ||||
| # how preparing the data works. | ||||
|  | ||||
|  | ||||
| myNodes <- list(label = c("Fungi (1014)",              # 0 <- node ID | ||||
|                           "Ophiostomatales (6)",       # 1 | ||||
|                           "Other...",                  # 2 | ||||
|                           "Sporothrix (4)",            # 3 | ||||
|                           "Other...",                  # 4 | ||||
|                           "Sporothrix schenckii (2)",  # 5 | ||||
|                           "Other..."                   # 6 | ||||
|                           ), | ||||
|                 x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0), | ||||
|                 y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7), | ||||
|                 color = c("#f2f2f0", # | ||||
|                           "#ffd5c4", | ||||
|                           "#CCCCCC", | ||||
|                           "#ff9582", | ||||
|                           "#CCCCCC", | ||||
|                           "#ed394e", | ||||
|                           "#CCCCCC" | ||||
|                           ), | ||||
|                 pad = 15, | ||||
|                 thickness = 20, | ||||
|                 line = list(color = "black", | ||||
|                             width = 0.5)) | ||||
|  | ||||
| myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of | ||||
|                 target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0 | ||||
|                 value =  c(6, 18, 4, 2, 2, 2))  # and node 1 | ||||
|  | ||||
| # Setting up the actual plot ... | ||||
| fig  <-  plotly::plot_ly(type = "sankey", | ||||
|                          arrangement = "snap", | ||||
|                          orientation = "h", | ||||
|                          node = myNodes, | ||||
|                          link = myLinks) | ||||
|  | ||||
| # Adding and adjusting a few layout parameters | ||||
| fig <- plotly::layout(fig, | ||||
|               title = "Fungi Genomes - Classification", | ||||
|               font = list(size = 10)) | ||||
|  | ||||
| fig     # plot the diagram | ||||
|  | ||||
| # Note that the plot appears in the Viewer window, not the Plot window, and that | ||||
| # it is interactive: you can hover over nodes and links, and drag the nodes | ||||
| # around. | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,234 +1,234 @@ | ||||
| # tocID <- "BIN-PHYLO-Data_preparation.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-PHYLO-Data_preparation unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0    First 2017 version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                     Line | ||||
| #TOC> --------------------------------------------------------- | ||||
| #TOC>   1        Preparations                                45 | ||||
| #TOC>   2        Fetching sequences                          77 | ||||
| #TOC>   3        Multiple Sequence Alignment                118 | ||||
| #TOC>   4        Reviewing and Editing Alignments           137 | ||||
| #TOC>   4.1        Masking workflow                         153 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Preparations  ======================================================== | ||||
|  | ||||
|  | ||||
| # You need to reload your protein database, including changes that might have | ||||
| # been made to the reference files. If you have worked with the prerequisite | ||||
| # units, you should have a script named "makeProteinDB.R" that will create the | ||||
| # myDB object with a protein and feature database. Ask for advice if not. | ||||
| source("myScripts/makeProteinDB.R") | ||||
|  | ||||
| # Load packages we need | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("msa", quietly = TRUE)) { | ||||
|   BiocManager::install("msa") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = msa)       # basic information | ||||
| #  browseVignettes("msa")  # available vignettes | ||||
| #  data(package = "msa")   # available datasets | ||||
|  | ||||
|  | ||||
| # =    2  Fetching sequences  ================================================== | ||||
|  | ||||
|  | ||||
| # myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1 | ||||
| # RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES | ||||
| # domains. You have annotated their ranges as a feature. The following code | ||||
| # retrieves the sequences from myDB. You have seen similar code in other units. | ||||
|  | ||||
| sel <- grep("^MBP1_", myDB$protein$name) | ||||
| (proNames <- myDB$protein$name[sel]) | ||||
| (proIDs <- myDB$protein$ID[sel]) | ||||
|  | ||||
| (sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||
| (fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% ! | ||||
|                               myDB$annotation$featureID == sel])      #  ==  ! | ||||
|                                                                       # Why? | ||||
| APSI <- character(length(fanIDs)) | ||||
|  | ||||
| for (i in seq_along(fanIDs)) { | ||||
|   sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index | ||||
|   proID <- myDB$annotation$proteinID[sel]   # get its protein ID | ||||
|   start <- myDB$annotation$start[sel]       # get start ... | ||||
|   end   <- myDB$annotation$end[sel]         # ... and end | ||||
|  | ||||
|   sel <- myDB$protein$ID == proID           # get the protein row index ... | ||||
|                                             # ... and the sequence | ||||
|   APSI[i] <- substring(myDB$protein$sequence[sel], start, end) | ||||
|   names(APSI)[i] <- (myDB$protein$name[sel]) | ||||
| } | ||||
|  | ||||
| head(APSI) | ||||
|  | ||||
| # Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our | ||||
| # phylogenetic tree (see the unit's Wiki page for details on the sequence). | ||||
|  | ||||
| APSI <- c(APSI, | ||||
| "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ") | ||||
| names(APSI)[length(APSI)] <- "KILA_ESCCO" | ||||
| tail(APSI) | ||||
|  | ||||
|  | ||||
| # =    3  Multiple Sequence Alignment  ========================================= | ||||
|  | ||||
| # This vector of sequences with named elements fulfills the requirements to be | ||||
| # imported as a Biostrings object - an AAStringSet - which we need as input for | ||||
| # the MSA algorithms in the msa package. | ||||
| # | ||||
|  | ||||
| APSESSet <- Biostrings::AAStringSet(APSI) | ||||
| APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned") | ||||
|  | ||||
| # Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If | ||||
| # that happens in your case, just use msaClustalOmega() instead. | ||||
|  | ||||
| # inspect the alignment. | ||||
| writeALN(APSESMsa) | ||||
|  | ||||
| # What do you think? Is this a good alignment for phylogenetic inference? | ||||
|  | ||||
|  | ||||
| # =    4  Reviewing and Editing Alignments  ==================================== | ||||
|  | ||||
|  | ||||
| # Head back to the Wiki page for this unit and read up on the background | ||||
| # first. | ||||
|  | ||||
| # Let's mask out all columns that have observations for | ||||
| # less than 1/3 of the sequences in the dataset. This | ||||
| # means they have more than round(nrow(APSESMsa) * (2/3)) | ||||
| # hyphens in a column. | ||||
| # | ||||
| # We take all sequences, split them into single | ||||
| # characters, and put them into a matrix. Then we | ||||
| # go through the matrix, column by column and decide | ||||
| # whether we want to include that column. | ||||
|  | ||||
| # ==   4.1  Masking workflow  ================================================== | ||||
|  | ||||
| # get the length of the alignment | ||||
| (lenAli <- APSESMsa@unmasked@ranges@width[1]) | ||||
|  | ||||
| # initialize a matrix that can hold all characters | ||||
| # individually | ||||
| msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli), | ||||
|                     ncol = lenAli) | ||||
|  | ||||
| # assign the correct rownames | ||||
| rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES | ||||
| # Split each aligned sequence into single characters and store them as one | ||||
| # matrix row. seq_len() gives an empty sequence (not c(1, 0)) for zero rows. | ||||
| for (i in seq_len(nrow(APSESMsa))) { | ||||
|   msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), "")) | ||||
| } | ||||
|  | ||||
| # inspect the result | ||||
| msaMatrix[1:7, 30:40] | ||||
|  | ||||
| # Now let's make a logical vector with an element for each column that selects | ||||
| # which columns should be masked out. | ||||
|  | ||||
| # The number of hyphens in a column is easy to count. Consider: | ||||
|  | ||||
|     msaMatrix[ , 20]             # column 20 | ||||
|     msaMatrix[ , 20] == "-"      # TRUE for all gap characters | ||||
| sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE | ||||
|  | ||||
| # Thus filling our logical vector is simple: | ||||
|  | ||||
| # initialize a mask | ||||
| colMask <- logical(ncol(msaMatrix)) | ||||
|  | ||||
| # define the threshold for rejecting a column | ||||
| limit <- round(nrow(APSESMsa) * (2/3)) | ||||
|  | ||||
| # iterate over all columns, and write TRUE if there are less-or-equal to "limit" | ||||
| # hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis | ||||
| # and FALSE columns will be rejected. | ||||
| # seq_len() gives an empty sequence (not c(1, 0)) if there are no columns | ||||
| for (i in seq_len(ncol(msaMatrix))) { | ||||
|   count <- sum(msaMatrix[ , i] == "-")   # number of gap characters in column i | ||||
|   colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not | ||||
| } | ||||
|  | ||||
| # Inspect the mask | ||||
| colMask | ||||
|  | ||||
| # How many positions are being kept? | ||||
| sum(colMask) | ||||
|  | ||||
| cat(sprintf("We are masking %4.2f %% of alignment columns.\n", | ||||
|             100 * (1 - (sum(colMask) / length(colMask))))) | ||||
|  | ||||
|  | ||||
| # Next, we use colMask to remove the masked columns from the matrix | ||||
| # in one step: | ||||
| maskedMatrix <- msaMatrix[ , colMask] | ||||
|  | ||||
| # check: | ||||
| ncol(maskedMatrix) | ||||
|  | ||||
| # ... then collapse each row of single characters back into a string ... | ||||
| # Preallocate one (empty) string per alignment row instead of growing the | ||||
| # vector inside the loop, then paste each row's characters back together. | ||||
| APSESphyloSet <- character(nrow(maskedMatrix)) | ||||
| for (i in seq_len(nrow(maskedMatrix))) { | ||||
|   APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="") | ||||
| } | ||||
| names(APSESphyloSet) <- rownames(maskedMatrix)   # keep the sequence names | ||||
|  | ||||
| # inspect ... | ||||
| writeALN(APSESphyloSet) | ||||
|  | ||||
| # As you see, we have removed a three residue insertion from MBP1_NEUCR, and | ||||
| # several indels from the KILA_ESCCO outgroup sequence. | ||||
|  | ||||
|  | ||||
| # We save the aligned, masked domains to a file in the data/ directory, | ||||
| # in multi-FASTA format. | ||||
| writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa") | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-PHYLO-Data_preparation.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-PHYLO-Data_preparation unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0    First 2017 version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                     Line | ||||
| #TOC> --------------------------------------------------------- | ||||
| #TOC>   1        Preparations                                45 | ||||
| #TOC>   2        Fetching sequences                          77 | ||||
| #TOC>   3        Multiple Sequence Alignment                118 | ||||
| #TOC>   4        Reviewing and Editing Alignments           137 | ||||
| #TOC>   4.1        Masking workflow                         153 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Preparations  ======================================================== | ||||
|  | ||||
|  | ||||
| # You need to reload your protein database, including changes that might have | ||||
| # been made to the reference files. If you have worked with the prerequisite | ||||
| # units, you should have a script named "makeProteinDB.R" that will create the | ||||
| # myDB object with a protein and feature database. Ask for advice if not. | ||||
| source("myScripts/makeProteinDB.R") | ||||
|  | ||||
| # Load packages we need | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("msa", quietly = TRUE)) { | ||||
|   BiocManager::install("msa") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = msa)       # basic information | ||||
| #  browseVignettes("msa")  # available vignettes | ||||
| #  data(package = "msa")   # available datasets | ||||
|  | ||||
|  | ||||
| # =    2  Fetching sequences  ================================================== | ||||
|  | ||||
|  | ||||
| # myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1 | ||||
| # RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES | ||||
| # domains. You have annotated their ranges as a feature. The following code | ||||
| # retrieves the sequences from myDB. You have seen similar code in other units. | ||||
|  | ||||
| sel <- grep("^MBP1_", myDB$protein$name) | ||||
| (proNames <- myDB$protein$name[sel]) | ||||
| (proIDs <- myDB$protein$ID[sel]) | ||||
|  | ||||
| (sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||
| (fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% ! | ||||
|                               myDB$annotation$featureID == sel])      #  ==  ! | ||||
|                                                                       # Why? | ||||
| APSI <- character(length(fanIDs)) | ||||
|  | ||||
| for (i in seq_along(fanIDs)) { | ||||
|   sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index | ||||
|   proID <- myDB$annotation$proteinID[sel]   # get its protein ID | ||||
|   start <- myDB$annotation$start[sel]       # get start ... | ||||
|   end   <- myDB$annotation$end[sel]         # ... and end | ||||
|  | ||||
|   sel <- myDB$protein$ID == proID           # get the protein row index ... | ||||
|                                             # ... and the sequence | ||||
|   APSI[i] <- substring(myDB$protein$sequence[sel], start, end) | ||||
|   names(APSI)[i] <- (myDB$protein$name[sel]) | ||||
| } | ||||
|  | ||||
| head(APSI) | ||||
|  | ||||
| # Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our | ||||
| # phylogenetic tree (see the unit's Wiki page for details on the sequence). | ||||
|  | ||||
| APSI <- c(APSI, | ||||
| "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ") | ||||
| names(APSI)[length(APSI)] <- "KILA_ESCCO" | ||||
| tail(APSI) | ||||
|  | ||||
|  | ||||
| # =    3  Multiple Sequence Alignment  ========================================= | ||||
|  | ||||
| # This vector of sequences with named elements fulfills the requirements to be | ||||
| # imported as a Biostrings object - an AAStringSet - which we need as input for | ||||
| # the MSA algorithms in the msa package. | ||||
| # | ||||
|  | ||||
| APSESSet <- Biostrings::AAStringSet(APSI) | ||||
| APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned") | ||||
|  | ||||
| # Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If | ||||
| # that happens in your case, just use msaClustalOmega() instead. | ||||
|  | ||||
| # inspect the alignment. | ||||
| writeALN(APSESMsa) | ||||
|  | ||||
| # What do you think? Is this a good alignment for phylogenetic inference? | ||||
|  | ||||
|  | ||||
| # =    4  Reviewing and Editing Alignments  ==================================== | ||||
|  | ||||
|  | ||||
| # Head back to the Wiki page for this unit and read up on the background | ||||
| # first. | ||||
|  | ||||
| # Let's mask out all columns that have observations for | ||||
| # less than 1/3 of the sequences in the dataset. This | ||||
| # means they have more than round(nrow(APSESMsa) * (2/3)) | ||||
| # hyphens in a column. | ||||
| # | ||||
| # We take all sequences, split them into single | ||||
| # characters, and put them into a matrix. Then we | ||||
| # go through the matrix, column by column and decide | ||||
| # whether we want to include that column. | ||||
|  | ||||
| # ==   4.1  Masking workflow  ================================================== | ||||
|  | ||||
| # get the length of the alignment | ||||
| (lenAli <- APSESMsa@unmasked@ranges@width[1]) | ||||
|  | ||||
| # initialize a matrix that can hold all characters | ||||
| # individually | ||||
| msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli), | ||||
|                     ncol = lenAli) | ||||
|  | ||||
| # assign the correct rownames, then split each aligned sequence into single | ||||
| rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES  # characters, row by row | ||||
| for (i in seq_len(nrow(APSESMsa))) {  # seq_len() is safe for zero rows | ||||
|   msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), "")) | ||||
| } | ||||
|  | ||||
| # inspect the result | ||||
| msaMatrix[1:7, 30:40] | ||||
|  | ||||
| # Now let's make a logical vector with an element for each column that selects | ||||
| # which columns should be masked out. | ||||
|  | ||||
| # The number of hyphens in a column is easy to count. Consider: | ||||
|  | ||||
|     msaMatrix[ , 20]             # column 20 | ||||
|     msaMatrix[ , 20] == "-"      # TRUE for all gap characters | ||||
| sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE | ||||
|  | ||||
| # Thus filling our logical vector is simple: | ||||
|  | ||||
| # initialize a mask | ||||
| colMask <- logical(ncol(msaMatrix)) | ||||
|  | ||||
| # define the threshold for rejecting a column | ||||
| limit <- round(nrow(APSESMsa) * (2/3)) | ||||
|  | ||||
| # iterate over all columns, and write TRUE if there are less-or-equal to "limit" | ||||
| # hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis | ||||
| # and FALSE columns will be rejected. (seq_len() is safe for zero columns.) | ||||
| for (i in seq_len(ncol(msaMatrix))) { | ||||
|   count <- sum(msaMatrix[ , i] == "-") | ||||
|   colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not | ||||
| } | ||||
|  | ||||
| # Inspect the mask | ||||
| colMask | ||||
|  | ||||
| # How many positions are being kept? | ||||
| sum(colMask) | ||||
|  | ||||
| cat(sprintf("We are masking %4.2f %% of alignment columns.\n", | ||||
|             100 * (1 - (sum(colMask) / length(colMask))))) | ||||
|  | ||||
|  | ||||
| # Next, we use colMask to remove the masked columns from the matrix | ||||
| # in one step: | ||||
| maskedMatrix <- msaMatrix[ , colMask] | ||||
|  | ||||
| # check: | ||||
| ncol(maskedMatrix) | ||||
|  | ||||
| # ... then collapse each row of single characters back into a string ... | ||||
| APSESphyloSet <- character(nrow(maskedMatrix))  # preallocate: one per row | ||||
| for (i in seq_len(nrow(maskedMatrix))) { | ||||
|   APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="") | ||||
| } | ||||
| names(APSESphyloSet) <- rownames(maskedMatrix) | ||||
|  | ||||
| # inspect ... | ||||
| writeALN(APSESphyloSet) | ||||
|  | ||||
| # As you see, we have removed a three residue insertion from MBP1_NEUCR, and | ||||
| # several indels from the KILA_ESCCO outgroup sequence. | ||||
|  | ||||
|  | ||||
| # We save the aligned, masked domains to a file in the data/ directory, | ||||
| # in multi-FASTA format. | ||||
| writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa") | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,406 +1,406 @@ | ||||
| # tocID <- "BIN-PHYLO-Tree_analysis.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-PHYLO-Tree_analysis unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 updates. Deprecate iTol and use taxize:: instead. | ||||
| #                  Rewrite of tip re-ordering. Better handling of | ||||
| #                  messages. pBar() for randomization. | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0.2  Typo in variable name, style changes | ||||
| #           1.0.1  Wrong section heading | ||||
| #           1.0    First 2017 version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                              Line | ||||
| #TOC> -------------------------------------------------- | ||||
| #TOC>   1        Preparation and Tree Plot            50 | ||||
| #TOC>   2        SPECIES REFERENCE TREE               66 | ||||
| #TOC>   3        Tree Analysis                       117 | ||||
| #TOC>   3.1        Rooting Trees                     177 | ||||
| #TOC>   3.2        Rotating Clades                   222 | ||||
| #TOC>   3.3        Computing tree distances          309 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Preparation and Tree Plot  =========================================== | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("ape", quietly = TRUE)) { | ||||
|   install.packages("ape") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = ape)       # basic information | ||||
| #  browseVignettes("ape")    # available vignettes | ||||
| #  data(package = "ape")     # available datasets | ||||
|  | ||||
| # We change the graphics parameters from time to time, let's store the settable | ||||
| # defaults so we can recreate a sane state (dev.off() fails without a device): | ||||
| if (! is.null(dev.list())) { dev.off() } | ||||
| PAR <- par(no.readonly = TRUE) | ||||
|  | ||||
| # =    2  SPECIES REFERENCE TREE  ============================================== | ||||
|  | ||||
| # Before we do any kind of phylogenetic analysis of genes from several species, | ||||
| # we MUST have a reference tree of the taxonomic relationships in hand. This | ||||
| # context is absolutely required for the interpretation of our tree. | ||||
|  | ||||
| # We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package. | ||||
|  | ||||
| if (! requireNamespace("taxize", quietly = TRUE)) { | ||||
|   install.packages("taxize") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help   = taxize)       # basic information | ||||
| #  browseVignettes("taxize")    # available vignettes | ||||
| #  data(package  = "taxize")     # available datasets | ||||
|  | ||||
| ( mySOI <- c(myDB$taxonomy$ID, "83333") ) | ||||
| myClass <- taxize::classification(mySOI, db = "ncbi") | ||||
| str(myClass) | ||||
|  | ||||
| myClass[[1]] | ||||
|  | ||||
| fungiTree <- taxize::class2tree(myClass, check = TRUE) | ||||
| plot(fungiTree) | ||||
|  | ||||
| # The tree produced by taxize:: contains full length species names, | ||||
| # but it would be more convenient if it had bicodes instead. Also, the actual | ||||
| # tree is only part of the list(), which will cause problems later: | ||||
| str(fungiTree) | ||||
|  | ||||
| # we therefore simplify | ||||
| fungiTree <- fungiTree$phylo | ||||
| str(fungiTree) | ||||
|  | ||||
| # The species names are now in the vector $tip.label of this object. | ||||
| # We can use biCode() to shorten them. | ||||
| fungiTree$tip.label <- biCode(fungiTree$tip.label) | ||||
|  | ||||
| # Plot the tree | ||||
| nSP <- length(fungiTree$tip.label) | ||||
| plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE) | ||||
| text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4) | ||||
| ape::nodelabels(text = fungiTree$node.label, | ||||
|                 cex = 0.6, | ||||
|                 adj = 0.2, | ||||
|                 bg = "#D4F2DA") | ||||
| # Note that you can use the arrow buttons in the menu above the plot pane to | ||||
| # scroll back to plots you have created earlier - so you can reference back to | ||||
| # this species tree in your later analysis. | ||||
|  | ||||
|  | ||||
| # =    3  Tree Analysis  ======================================================= | ||||
|  | ||||
|  | ||||
| # 1.1  Visualizing your tree | ||||
| # The trees that are produced by Rphylip are stored as an object of class | ||||
| # "phylo". This is a class for phylogenetic trees that is widely used in the | ||||
| # community, practically all R phylogenetics packages have options to read and | ||||
| # manipulate such trees. Outside of R, a popular interchange format is the | ||||
| # Newick_format that you have seen above. It's easy to output your calculated | ||||
| # trees in Newick format and visualize them elsewhere. | ||||
|  | ||||
| # The "phylo" class object is one of R's "S3" objects and methods to plot and | ||||
| # print it have been defined with the Rphylip package, and in ape. You can | ||||
| # simply call plot(<your-tree>) and R knows what to do with <your-tree> and how | ||||
| # to plot it. The underlying function is plot.phylo(), and documentation for its | ||||
| # many options can be found by typing: | ||||
|  | ||||
| ?plot.phylo | ||||
|  | ||||
| # We load the APSES sequence tree that you produced in the | ||||
| # BIN-PHYLO-Tree_building unit: | ||||
| apsTree <- readRDS(file = "data/APSEStreeRproml.rds") | ||||
|  | ||||
| plot(apsTree) # default type is "phylogram" | ||||
| plot(apsTree, type = "unrooted") | ||||
| plot(apsTree, type = "fan", no.margin = TRUE) | ||||
|  | ||||
| # rescale to show all of the labels: | ||||
| # record the current plot parameters by assigning them to a variable ... | ||||
| (tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE)) | ||||
| # ... and adjust the plot limits for a new plot: | ||||
| plot(apsTree, | ||||
|      type = "fan", | ||||
|      x.lim = tmp$x.lim * 1.8, | ||||
|      y.lim = tmp$y.lim * 1.8, | ||||
|      cex = 0.8, | ||||
|      no.margin = TRUE) | ||||
|  | ||||
| # Inspect the tree object | ||||
| str(apsTree) | ||||
| apsTree$tip.label | ||||
| apsTree$edge | ||||
| apsTree$edge.length | ||||
|  | ||||
| # show the node / edge and tip labels on a plot | ||||
| plot(apsTree) | ||||
| ape::nodelabels() | ||||
| ape::edgelabels() | ||||
| ape::tiplabels() | ||||
|  | ||||
| # show the number of nodes, edges and tips | ||||
| ape::Nnode(apsTree) | ||||
| ape::Nedge(apsTree) | ||||
| ape::Ntip(apsTree) | ||||
|  | ||||
| par(PAR)   # reset graphics state | ||||
|  | ||||
| # Finally, write the tree to console in Newick format | ||||
| ape::write.tree(apsTree) | ||||
|  | ||||
| # ==   3.1  Rooting Trees  ===================================================== | ||||
|  | ||||
| # In order to analyse the tree, it is helpful to root it first and reorder its | ||||
| # clades. Contrary to documentation, Rproml() returns an unrooted tree. | ||||
|  | ||||
| ape::is.rooted(apsTree) | ||||
|  | ||||
| # You can root the tree with the command root() from the "ape" package. | ||||
|  | ||||
| plot(apsTree) | ||||
|  | ||||
| # add labels for internal nodes and tips | ||||
| ape::nodelabels(cex = 0.5, frame = "circle") | ||||
| ape::tiplabels(cex = 0.5, frame = "rect") | ||||
|  | ||||
| # The outgroup of the tree (KILA_ESCCO) is tip "11" in my sample tree, but it | ||||
| # may be a different number in yours. Use the correct tip number for "outgroup". | ||||
| apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE) | ||||
| plot(apsTree) | ||||
| ape::is.rooted(apsTree) | ||||
|  | ||||
| # This tree _looks_ unchanged, because when the root trifurcation was resolved, | ||||
| # an edge of length zero was added to connect the MRCA (Most Recent Common | ||||
| # Ancestor) of the ingroup. | ||||
|  | ||||
| # The edge lengths are stored in the phylo object: | ||||
| apsTree$edge.length | ||||
|  | ||||
| # ... and you can assign a small arbitrary value to the edge | ||||
| # to show how it connects to the tree without having an | ||||
| # overlap. | ||||
| apsTree$edge.length[1] <- 0.1 | ||||
| plot(apsTree, cex = 0.7) | ||||
| ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") | ||||
|  | ||||
|  | ||||
| # This procedure does however not assign an actual length to a root edge, and | ||||
| # therefore no root edge is visible on the plot. Why? , you might ask. I ask | ||||
| # myself that too. We'll just add a length by hand. | ||||
|  | ||||
| apsTree$root.edge <- mean(apsTree$edge.length) * 1.5 | ||||
| plot(apsTree, cex = 0.7, root.edge = TRUE) | ||||
| ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") | ||||
|  | ||||
|  | ||||
| # ==   3.2  Rotating Clades  =================================================== | ||||
|  | ||||
| # To interpret the tree, it is useful to rotate the clades so that they appear | ||||
| # in the order expected from the cladogram of species. | ||||
|  | ||||
| # We can either rotate around individual internal nodes ... | ||||
| layout(matrix(1:2, 1, 2)) | ||||
| plot(apsTree, no.margin = TRUE, root.edge = TRUE) | ||||
| ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866") | ||||
| plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE) | ||||
| ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66") | ||||
| # Note that the species at the bottom of the clade descending from node | ||||
| # 13 is now plotted at the top. | ||||
|  | ||||
| par(PAR)   # reset graphics state | ||||
|  | ||||
| # ... or we can rearrange the tree so it corresponds as well as possible to a | ||||
| # predefined tip ordering. Here we use the ordering that taxize:: has inferred | ||||
| # from the NCBI taxonomic classification. | ||||
|  | ||||
| nOrg <- length(apsTree$tip.label) | ||||
|  | ||||
| plot(fungiTree, | ||||
|      no.margin = FALSE, root.edge = TRUE) | ||||
| ape::nodelabels(text = fungiTree$node.label, | ||||
|                 cex = 0.5, | ||||
|                 adj = 0.2, | ||||
|                 bg = "#D4F2DA") | ||||
|  | ||||
| # These are the fungi tree tips ... | ||||
| fungiTree$tip.label | ||||
| # ... and their order is determined by the edge-list that is stored in | ||||
| fungiTree$edge | ||||
| # which edges join the tips? | ||||
| ape::tiplabels(cex = 0.5, frame = "rect") | ||||
| # as you can see, the tips (range [1:nOrg] ) are in column 2 and they are | ||||
| # ordered from bottom to top. | ||||
| # And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ... | ||||
|  | ||||
| sel <- fungiTree$edge[ , 2 ] <= nOrg | ||||
| ( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] ) | ||||
|  | ||||
| # Now, here are the genes of the apsTree tips ... | ||||
| apsTree$tip.label | ||||
|  | ||||
| # ... and the "constraint"  we need for reordering, according to the help page | ||||
| # of ape::rotateConstr(), is "a vector specifying the order of the tips as they | ||||
| # should appear (from bottom to top)". Thus we add the "MBP1_" prefix ... | ||||
| oSp <- paste0("MBP1_", oSp) | ||||
| ( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) ) # ... and rename the outgroup | ||||
|  | ||||
| # Then we can plot the two trees to compare: the fungi- tree | ||||
| par(PAR)   # reset graphics state | ||||
| layout(matrix(1:2, 1, 2)) | ||||
| plot(fungiTree, | ||||
|     no.margin = TRUE, | ||||
|      root.edge = TRUE) | ||||
| ape::nodelabels(text = fungiTree$node.label, | ||||
|                 cex = 0.5, | ||||
|                 adj = 0.2, | ||||
|                 bg = "#D4F2DA") | ||||
|  | ||||
| # and the re-organized apsTree ... | ||||
| plot(ape::rotateConstr(apsTree, constraint = oSp[]), | ||||
|      no.margin = TRUE, | ||||
|      root.edge = TRUE) | ||||
|  | ||||
| par(PAR)   # reset graphics state | ||||
|  | ||||
| # As you can see, the reordering is not perfect, since the topologies are | ||||
| # different, mostly due to the unresolved nodes in the reference tree. One | ||||
| # could play with that ... | ||||
|  | ||||
|  | ||||
| # Task: Study the two trees and consider their similarities and differences. | ||||
| #         What do you expect? What do you find? Note that this is not a "mixed" | ||||
| #         gene tree yet, since it contains only a single gene for the species | ||||
| #         we considered. All of the branch points in this tree are speciation | ||||
| #         events. Thus the gene tree should have the same topology as the | ||||
| #         species tree. Does it? Are the differences important? How many | ||||
| #         branches would you need to remove and reinsert elsewhere to get the | ||||
| #         same topology as the species tree? | ||||
|  | ||||
| # In order to quantify how different these two trees are, we need to compute | ||||
| # tree distances. | ||||
|  | ||||
|  | ||||
| # ==   3.3  Computing tree distances  ========================================== | ||||
|  | ||||
|  | ||||
| # Many superb phylogeny tools are contributed by the phangorn package. | ||||
|  | ||||
| if (! requireNamespace("phangorn", quietly = TRUE)) { | ||||
|   install.packages("phangorn") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = phangorn)       # basic information | ||||
| #  browseVignettes("phangorn")    # available vignettes | ||||
| #  data(package = "phangorn")     # available datasets | ||||
|  | ||||
| # To compare two trees, they must have the same tip labels. We delete "MBP1_" or | ||||
| # "KILA_" from the existing tip labels in a copy of our APSES domain tree. | ||||
| apsTree2 <- apsTree | ||||
| apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) | ||||
|  | ||||
|  | ||||
| # phangorn provides several functions to compute tree-differences (and there | ||||
| # is a _whole_ lot of theory on how to compare trees). treedist() returns the | ||||
| # "symmetric difference" and the "path difference" between two trees. | ||||
| phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE) | ||||
|  | ||||
| # Numbers. What do they mean? How much more similar is our apsTree to the | ||||
| # (presumably) ground truth of fungiTree than a random tree would be? | ||||
| # The ape package provides the function rtree() | ||||
| # to compute random trees. | ||||
|  | ||||
| ape::rtree(n = length(apsTree2$tip.label), # number of tips | ||||
|           rooted = TRUE,                   # we rooted the tree above, | ||||
|                                            #  and fungiTree is rooted anyway | ||||
|           tip.label = apsTree2$tip.label,  # use the apsTree2 labels | ||||
|           br = NULL)                       # don't generate branch lengths since | ||||
|                                            #   fungiTree has none, so we can't | ||||
|                                            #   compare them anyway. | ||||
|  | ||||
| # (Note the warning message about non-binary trees; we'll suppress that later | ||||
| #  by wrapping the function call in suppressMessages(); we don't want to | ||||
| #  print it 10,000 times :-) | ||||
|  | ||||
|  | ||||
| # Let's compute some random trees this way, calculate the distances to | ||||
| # fungiTree, and then compare the values we get for apsTree2. The random | ||||
| # trees are provided by ape::rtree(). | ||||
|  | ||||
| N <- 10000  # takes about 15 seconds, and we'll use the pBar function, | ||||
|             # defined in .utilities.R  to keep track of where we are at: | ||||
| myTreeDistances <- matrix(numeric(N * 2), ncol = 2) | ||||
| colnames(myTreeDistances) <- c("symm", "path") | ||||
|  | ||||
| set.seed(112358) | ||||
| for (i in 1:N) { | ||||
|   pBar(i, N) | ||||
|   xTree <- ape::rtree(n = length(apsTree2$tip.label), | ||||
|                       rooted = TRUE, | ||||
|                       tip.label = apsTree2$tip.label, | ||||
|                       br = NULL) | ||||
|   myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree)) | ||||
| } | ||||
| set.seed(NULL)                      # reset the random number generator | ||||
|  | ||||
| table(myTreeDistances[, "symm"]) | ||||
|  | ||||
| ( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] ) | ||||
|  | ||||
| # Random events less-or-equal to observation, divided by total number of | ||||
| # events gives us the empirical p-value. | ||||
| cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n", | ||||
|             (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1))) | ||||
|  | ||||
| par(PAR)   # reset graphics state | ||||
| hist(myTreeDistances[, "path"], | ||||
|      col = "aliceblue", | ||||
|      main = "Distances of random Trees to fungiTree") | ||||
| (pathObs <- phangorn::treedist(fungiTree, apsTree2)[2]) | ||||
| abline(v = pathObs, col = "chartreuse") | ||||
|  | ||||
| # Random events less-or-equal to observation, divided by total number of | ||||
| # events gives us the empirical p-value (path distances vs. pathObs). | ||||
| cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n", | ||||
|             (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1))) | ||||
|  | ||||
| # Indeed, our apsTree is _very_ much more similar to the species tree than | ||||
| # we would expect by random chance. | ||||
|  | ||||
| # What do we gain from that analysis? Analyzing the tree we get from a single | ||||
| # gene of orthologous sequences is a positive control in our computational | ||||
| # experiment. If these genes are indeed orthologues, a correct tree-building | ||||
| # program ought to give us a tree that exactly matches the species tree. | ||||
| # Evaluating how far off we are from the known correct result gives us a way to | ||||
| # validate our workflow and our algorithm. If we can't get that right, we can't | ||||
| # expect to get "real" data right either. Employing such positive controls in | ||||
| # every computational experiment is essential for research. Not doing so is | ||||
| # Cargo Cult Bioinformatics. | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-PHYLO-Tree_analysis.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-PHYLO-Tree_analysis unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 updates. Deprecate iTol and use taxize:: instead. | ||||
| #                  Rewrite of tip re-ordering. Better handling of | ||||
| #                  messages. pBar() for randomization. | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0.2  Typo in variable name, style changes | ||||
| #           1.0.1  Wrong section heading | ||||
| #           1.0    First 2017 version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                              Line | ||||
| #TOC> -------------------------------------------------- | ||||
| #TOC>   1        Preparation and Tree Plot            50 | ||||
| #TOC>   2        SPECIES REFERENCE TREE               66 | ||||
| #TOC>   3        Tree Analysis                       117 | ||||
| #TOC>   3.1        Rooting Trees                     177 | ||||
| #TOC>   3.2        Rotating Clades                   222 | ||||
| #TOC>   3.3        Computing tree distances          309 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Preparation and Tree Plot  =========================================== | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("ape", quietly = TRUE)) { | ||||
|   install.packages("ape") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = ape)       # basic information | ||||
| #  browseVignettes("ape")    # available vignettes | ||||
| #  data(package = "ape")     # available datasets | ||||
|  | ||||
| # We change the graphics parameters from time to time, let's store the settable | ||||
| # defaults so we can recreate a sane state (dev.off() fails without a device): | ||||
| if (! is.null(dev.list())) { dev.off() } | ||||
| PAR <- par(no.readonly = TRUE) | ||||
|  | ||||
| # =    2  SPECIES REFERENCE TREE  ============================================== | ||||
|  | ||||
| # Before we do any kind of phylogenetic analysis of genes from several species, | ||||
| # we MUST have a reference tree of the taxonomic relationships in hand. This | ||||
| # context is absolutely required for the interpretation of our tree. | ||||
|  | ||||
| # We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package. | ||||
|  | ||||
| if (! requireNamespace("taxize", quietly = TRUE)) { | ||||
|   install.packages("taxize") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help   = taxize)       # basic information | ||||
| #  browseVignettes("taxize")    # available vignettes | ||||
| #  data(package  = "taxize")     # available datasets | ||||
|  | ||||
| ( mySOI <- c(myDB$taxonomy$ID, "83333") ) | ||||
| myClass <- taxize::classification(mySOI, db = "ncbi") | ||||
| str(myClass) | ||||
|  | ||||
| myClass[[1]] | ||||
|  | ||||
| fungiTree <- taxize::class2tree(myClass, check = TRUE) | ||||
| plot(fungiTree) | ||||
|  | ||||
| # The tree produced by taxize:: contains full length species names, | ||||
| # but it would be more convenient if it had bicodes instead. Also, the actual | ||||
| # tree is only part of the list(), which will cause problems later: | ||||
| str(fungiTree) | ||||
|  | ||||
| # we therefore simplify | ||||
| fungiTree <- fungiTree$phylo | ||||
| str(fungiTree) | ||||
|  | ||||
| # The species names are now in the vector $tip.label of this object. | ||||
| # We can use biCode() to shorten them. | ||||
| fungiTree$tip.label <- biCode(fungiTree$tip.label) | ||||
|  | ||||
| # Plot the tree | ||||
| nSP <- length(fungiTree$tip.label) | ||||
| plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE) | ||||
| text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4) | ||||
| ape::nodelabels(text = fungiTree$node.label, | ||||
|                 cex = 0.6, | ||||
|                 adj = 0.2, | ||||
|                 bg = "#D4F2DA") | ||||
| # Note that you can use the arrow buttons in the menu above the plot pane to | ||||
| # scroll back to plots you have created earlier - so you can reference back to | ||||
| # this species tree in your later analysis. | ||||
|  | ||||
|  | ||||
| # =    3  Tree Analysis  ======================================================= | ||||
|  | ||||
|  | ||||
| # 1.1  Visualizing your tree | ||||
| # The trees that are produced by Rphylip are stored as an object of class | ||||
| # "phylo". This is a class for phylogenetic trees that is widely used in the | ||||
| # community, practically all R phylogenetics packages have options to read and | ||||
| # manipulate such trees. Outside of R, a popular interchange format is the | ||||
| # Newick_format that you have seen above. It's easy to output your calculated | ||||
| # trees in Newick format and visualize them elsewhere. | ||||
|  | ||||
| # The "phylo" class object is one of R's "S3" objects and methods to plot and | ||||
| # print it have been defined with the Rphylip package, and in ape. You can | ||||
| # simply call plot(<your-tree>) and R knows what to do with <your-tree> and how | ||||
| # to plot it. The underlying function is plot.phylo(), and documentation for its | ||||
| # many options can be found by typing: | ||||
|  | ||||
| ?plot.phylo | ||||
|  | ||||
| # We load the APSES sequence tree that you produced in the | ||||
| # BIN-PHYLO-Tree_building unit: | ||||
| apsTree <- readRDS(file = "data/APSEStreeRproml.rds") | ||||
|  | ||||
| plot(apsTree) # default type is "phylogram" | ||||
| plot(apsTree, type = "unrooted") | ||||
| plot(apsTree, type = "fan", no.margin = TRUE) | ||||
|  | ||||
| # rescale to show all of the labels: | ||||
| # record the current plot parameters by assigning them to a variable ... | ||||
| (tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE)) | ||||
| # ... and adjust the plot limits for a new plot: | ||||
| plot(apsTree, | ||||
|      type = "fan", | ||||
|      x.lim = tmp$x.lim * 1.8, | ||||
|      y.lim = tmp$y.lim * 1.8, | ||||
|      cex = 0.8, | ||||
|      no.margin = TRUE) | ||||
|  | ||||
| # Inspect the tree object | ||||
| str(apsTree) | ||||
| apsTree$tip.label | ||||
| apsTree$edge | ||||
| apsTree$edge.length | ||||
|  | ||||
| # show the node / edge and tip labels on a plot | ||||
| plot(apsTree) | ||||
| ape::nodelabels() | ||||
| ape::edgelabels() | ||||
| ape::tiplabels() | ||||
|  | ||||
| # show the number of nodes, edges and tips | ||||
| ape::Nnode(apsTree) | ||||
| ape::Nedge(apsTree) | ||||
| ape::Ntip(apsTree) | ||||
|  | ||||
| par(PAR)   # reset graphics state | ||||
|  | ||||
| # Finally, write the tree to console in Newick format | ||||
| ape::write.tree(apsTree) | ||||
|  | ||||
| # ==   3.1  Rooting Trees  ===================================================== | ||||
|  | ||||
| # In order to analyse the tree, it is helpful to root it first and reorder its | ||||
| # clades. Contrary to documentation, Rproml() returns an unrooted tree. | ||||
|  | ||||
| ape::is.rooted(apsTree) | ||||
|  | ||||
| # You can root the tree with the command root() from the "ape" package. | ||||
|  | ||||
| plot(apsTree) | ||||
|  | ||||
| # add labels for internal nodes and tips | ||||
| ape::nodelabels(cex = 0.5, frame = "circle") | ||||
| ape::tiplabels(cex = 0.5, frame = "rect") | ||||
|  | ||||
| # The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different | ||||
| # number in yours. Substitute the correct node number below for "outgroup". | ||||
| apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE) | ||||
| plot(apsTree) | ||||
| ape::is.rooted(apsTree) | ||||
|  | ||||
| # This tree _looks_ unchanged, because when the root trifurcation was resolved, | ||||
| # an edge of length zero was added to connect the MRCA (Most Recent Common | ||||
| # Ancestor) of the ingroup. | ||||
|  | ||||
| # The edge lengths are stored in the phylo object: | ||||
| apsTree$edge.length | ||||
|  | ||||
| # ... and you can assign a small arbitrary value to the edge | ||||
| # to show how it connects to the tree without having an | ||||
| # overlap. | ||||
| apsTree$edge.length[1] <- 0.1 | ||||
| plot(apsTree, cex = 0.7) | ||||
| ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") | ||||
|  | ||||
|  | ||||
| # This procedure does however not assign an actual length to a root edge, and | ||||
| # therefore no root edge is visible on the plot. Why? , you might ask. I ask | ||||
| # myself that too. We'll just add a length by hand. | ||||
|  | ||||
| apsTree$root.edge <- mean(apsTree$edge.length) * 1.5 | ||||
| plot(apsTree, cex = 0.7, root.edge = TRUE) | ||||
| ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") | ||||
|  | ||||
|  | ||||
| # ==   3.2  Rotating Clades  =================================================== | ||||
|  | ||||
| # To interpret the tree, it is useful to rotate the clades so that they appear | ||||
| # in the order expected from the cladogram of species. | ||||
|  | ||||
| # We can either rotate around individual internal nodes ... | ||||
| layout(matrix(1:2, 1, 2)) | ||||
| plot(apsTree, no.margin = TRUE, root.edge = TRUE) | ||||
| ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866") | ||||
| plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE) | ||||
| ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66") | ||||
| # Note that the species at the bottom of the clade descending from node | ||||
| # 13 is now plotted at the top. | ||||
|  | ||||
| par(PAR)   # reset graphics state | ||||
|  | ||||
| # ... or we can rearrange the tree so it corresponds as well as possible to a | ||||
| # predefined tip ordering. Here we use the ordering that taxize:: has inferred | ||||
| # from the NCBI taxonomic classification. | ||||
|  | ||||
| nOrg <- length(apsTree$tip.label) | ||||
|  | ||||
| plot(fungiTree, | ||||
|      no.margin = FALSE, root.edge = TRUE) | ||||
| ape::nodelabels(text = fungiTree$node.label, | ||||
|                 cex = 0.5, | ||||
|                 adj = 0.2, | ||||
|                 bg = "#D4F2DA") | ||||
|  | ||||
| # These are the fungi tree tips ... | ||||
| fungiTree$tip.label | ||||
| # ... and their order is determined by the edge-list that is stored in | ||||
| fungiTree$edge | ||||
| # which edges join the tips? | ||||
| ape::tiplabels(cex = 0.5, frame = "rect") | ||||
| # as you can see, the tips (range [1:nOrg] ) are in column 2 and they are | ||||
| # ordered from bottom to top. | ||||
| # And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ... | ||||
|  | ||||
| sel <- fungiTree$edge[ , 2 ] <= nOrg | ||||
| ( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] ) | ||||
|  | ||||
| # Now, here are the genes of the apsTree tips ... | ||||
| apsTree$tip.label | ||||
|  | ||||
| # ... and the "constraint" we need for reordering, according to the help page | ||||
| # of ape::rotateConstr(), is "a vector specifying the order of the tips as they | ||||
| # should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix | ||||
| # to our vector of species codes, so that they match the apsTree tip labels ... | ||||
| oSp <- gsub("^", "MBP1_", oSp) | ||||
| # ... except for the outgroup, whose tip is labelled "KILA_ESCCO" in apsTree. | ||||
| # (The pattern must spell "ESCCO" exactly, otherwise nothing gets replaced and | ||||
| # the constraint contains a label that does not exist in apsTree.) | ||||
| ( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) ) | ||||
|  | ||||
| # Then we can plot the two trees to compare: the fungi- tree | ||||
| par(PAR)   # reset graphics state | ||||
| layout(matrix(1:2, 1, 2)) | ||||
| plot(fungiTree, | ||||
|     no.margin = TRUE, | ||||
|      root.edge = TRUE) | ||||
| ape::nodelabels(text = fungiTree$node.label, | ||||
|                 cex = 0.5, | ||||
|                 adj = 0.2, | ||||
|                 bg = "#D4F2DA") | ||||
|  | ||||
| # and the re-organized apsesTree ... | ||||
| plot(ape::rotateConstr(apsTree, constraint = oSp[]), | ||||
|      no.margin = TRUE, | ||||
|      root.edge = TRUE) | ||||
|  | ||||
| par(PAR)   # reset graphics state | ||||
|  | ||||
| # As you can see, the reordering is not perfect, since the topologies are | ||||
| # different, mostly due to the unresolved nodes in the reference tree. One | ||||
| # could play with that ... | ||||
|  | ||||
|  | ||||
| # Task: Study the two trees and consider their similarities and differences. | ||||
| #         What do you expect? What do you find? Note that this is not a "mixed" | ||||
| #         gene tree yet, since it contains only a single gene for the species | ||||
| #         we considered. All of the branch points in this tree are speciation | ||||
| #         events. Thus the gene tree should have the same topology as the | ||||
| #         species tree. Does it? Are the differences important? How many | ||||
| #         branches would you need to remove and reinsert elsewhere to get the | ||||
| #         same topology as the species tree? | ||||
|  | ||||
| # In order to quantify how different these two trees are, we need to compute | ||||
| # tree distances. | ||||
|  | ||||
|  | ||||
| # ==   3.3  Computing tree distances  ========================================== | ||||
|  | ||||
|  | ||||
| # Many superb phylogeny tools are contributed by the phangorn package. | ||||
|  | ||||
| if (! requireNamespace("phangorn", quietly = TRUE)) { | ||||
|   install.packages("phangorn") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = phangorn)       # basic information | ||||
| #  browseVignettes("phangorn")    # available vignettes | ||||
| #  data(package = "phangorn")     # available datasets | ||||
|  | ||||
| # To compare two trees, they must have the same tip labels. We delete "MBP1_" or | ||||
| # "KILA_" from the existing tip labels in a copy of our APSES domain tree. | ||||
| apsTree2 <- apsTree | ||||
| apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) | ||||
|  | ||||
|  | ||||
| # phangorn provides several functions to compute tree-differences (and there | ||||
| # is a _whole_ lot of theory on how to compare trees). treedist() returns the | ||||
| # "symmetric difference" | ||||
| phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE) | ||||
|  | ||||
| # Numbers. What do they mean? How much more similar is our apsTree to the | ||||
| # (presumably) ground truth of fungiTree than a random tree would be? | ||||
| # The ape package provides the function rtree() | ||||
| # to compute random trees. | ||||
|  | ||||
| ape::rtree(n = length(apsTree2$tip.label), # number of tips | ||||
|           rooted = TRUE,                   # we rooted the tree above, | ||||
|                                            #  and fungiTree is rooted anyway | ||||
|           tip.label = apsTree2$tip.label,  # use the apsTree2 labels | ||||
|           br = NULL)                       # don't generate branch lengths since | ||||
|                                            #   fungiTree has none, so we can't | ||||
|                                            #   compare them anyway. | ||||
|  | ||||
| # (Note the warning message about non-binary trees; we'll suppress that later | ||||
| #  by wrapping the function call in suppressMessages(); we don't want to | ||||
| #  print it 10,000 times :-) | ||||
|  | ||||
|  | ||||
| # Let's compute some random trees this way, calculate the distances to | ||||
| # fungiTree, and then compare the values we get for apsTree2. The random | ||||
| # trees are provided by ape::rtree(). | ||||
|  | ||||
| N <- 10000  # takes about 15 seconds, and we'll use the pBar function, | ||||
|             # defined in .utilities.R  to keep track of where we are at: | ||||
| myTreeDistances <- matrix(numeric(N * 2), ncol = 2) | ||||
| colnames(myTreeDistances) <- c("symm", "path") | ||||
|  | ||||
| set.seed(112358) | ||||
| for (i in 1:N) { | ||||
|   pBar(i, N) | ||||
|   xTree <- ape::rtree(n = length(apsTree2$tip.label), | ||||
|                       rooted = TRUE, | ||||
|                       tip.label = apsTree2$tip.label, | ||||
|                       br = NULL) | ||||
|   myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree)) | ||||
| } | ||||
| set.seed(NULL)                      # reset the random number generator | ||||
|  | ||||
| table(myTreeDistances[, "symm"]) | ||||
|  | ||||
| ( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] ) | ||||
|  | ||||
| # Random events less-or-equal to observation, divided by total number of | ||||
| # events gives us the empirical p-value. | ||||
| cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n", | ||||
|             (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1))) | ||||
|  | ||||
| par(PAR)   # reset graphics state | ||||
| hist(myTreeDistances[, "path"], | ||||
|      col = "aliceblue", | ||||
|      main = "Distances of random Trees to fungiTree") | ||||
| (pathObs <- phangorn::treedist(fungiTree, apsTree2)[2]) | ||||
| abline(v = pathObs, col = "chartreuse") | ||||
|  | ||||
| # Random events less-or-equal to observation, divided by total number of | ||||
| # events gives us the empirical p-value. Note that the random path distances | ||||
| # must be compared to the observed _path_ distance (pathObs), not to the | ||||
| # observed symmetric difference (symmObs). | ||||
| cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n", | ||||
|             (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1))) | ||||
|  | ||||
| # Indeed, our apsTree is _very_ much more similar to the species tree than | ||||
| # we would expect by random chance. | ||||
|  | ||||
| # What do we gain from that analysis? Analyzing the tree we get from a single | ||||
| # gene of orthologous sequences is a positive control in our computational | ||||
| # experiment. If these genes are indeed orthologues, a correct tree-building | ||||
| # program ought to give us a tree that exactly matches the species tree. | ||||
| # Evaluating how far off we are from the known correct result gives us a way to | ||||
| # validate our workflow and our algorithm. If we can't get that right, we can't | ||||
| # expect to get "real" data right either. Employing such positive controls in | ||||
| # every computational experiment is essential for research. Not doing so is | ||||
| # Cargo Cult Bioinformatics. | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,168 +1,168 @@ | ||||
| # tocID <- "BIN-PHYLO-Tree_building.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-PHYLO-Tree_building unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10   2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac: | ||||
| #                  instructions to authorize proml.app | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #           1.0    First 2017 version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #           Add MrBayes | ||||
| # https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                       Line | ||||
| #TOC> ----------------------------------------------------------- | ||||
| #TOC>   1        Calculating Trees                             48 | ||||
| #TOC>   1.1        PROMLPATH ...                               68 | ||||
| #TOC>   1.1.1          ... on the Mac                          73 | ||||
| #TOC>   1.1.2          ... on Windows                         101 | ||||
| #TOC>   1.1.3          ... on Linux                           115 | ||||
| #TOC>   1.1.4          Confirming PROMLPATH                   120 | ||||
| #TOC>   1.2        Building a maximum likelihood tree         134 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Calculating Trees  =================================================== | ||||
|  | ||||
|  | ||||
| # Follow the instructions found at phylip's home on the Web to install. If you | ||||
| # are on a Windows computer, take note of the installation directory. | ||||
|  | ||||
| # After you have installed Phylip on your computer, install the R package that | ||||
| # provides an interface to the Phylip functions. | ||||
|  | ||||
| if (! requireNamespace("Rphylip", quietly = TRUE)) { | ||||
|   install.packages("Rphylip") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Rphylip)       # basic information | ||||
| #  browseVignettes("Rphylip")    # available vignettes | ||||
| #  data(package = "Rphylip")     # available datasets | ||||
|  | ||||
| # This will install RPhylip, as well as its dependency, the package "ape". | ||||
|  | ||||
|  | ||||
| # ==   1.1  PROMLPATH ...  ===================================================== | ||||
| # The next part may be tricky. You will need to figure out where | ||||
| # on your computer Phylip has been installed and define the path | ||||
| # to the proml program that calculates a maximum-likelihood tree. | ||||
|  | ||||
| # ===   1.1.1  ... on the Mac                     | ||||
| # On the Mac, the standard installation places a phylip folder | ||||
| # in the /Applications directory. That folder contains all the | ||||
| # individual phylip programs as <name>.app files. These are not | ||||
| # the actual executables, but "app" files are actually directories | ||||
| # that contain the required resources for a program to run. | ||||
|  | ||||
| # The executable is in a subdirectory and you can point Rphylip | ||||
| # directly to that subdirectory to find the program it needs: | ||||
| # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS" | ||||
|  | ||||
| # However, RPHYLIP will not be able to run PHYLIP applications immediately, | ||||
| # because they have not been "signed" by the PHYLIP developers. The process | ||||
| # will be terminated by your system, with a warning. | ||||
|  | ||||
| #   -  Navigate to the phylip folder in your ~/Applications directory | ||||
| #   -  Descend into the "exe" folder and find  proml.app | ||||
| #   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that | ||||
| #      says: "macOS cannot verify the developer of “proml.app”. | ||||
| #             Are you sure you want to open it?" | ||||
| #   -  Click open to continue. You may need to allow access to the terminal | ||||
| #      as well. When the proml terminal session opens, you can type | ||||
| #      Ctrl-c to abort the program and close the window. | ||||
| # | ||||
| #   This adds proml.app to the list of known-good programs and you will not | ||||
| #   need to repeat this process. | ||||
| # | ||||
|  | ||||
| # ===   1.1.2  ... on Windows                     | ||||
| # On Windows you need to know where the programs have been installed, and you | ||||
| # need to specify a path that is correct for the Windows OS. Find the folder | ||||
| # that is named "exe", and right-click to inspect its properties. The path | ||||
| # should be listed among them. | ||||
|  | ||||
| # If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your | ||||
| # assignment has to be | ||||
| # PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe" | ||||
| # (Note: "/", not "\") | ||||
|  | ||||
| # I have heard that your path must not contain spaces, and it is prudent to | ||||
| # avoid other special characters as well. | ||||
|  | ||||
| # ===   1.1.3  ... on Linux                       | ||||
| # If you are running Linux I trust you know what to do. It's probably | ||||
| # something like | ||||
| # PROMLPATH <- "/usr/local/phylip-3.695/bin" | ||||
|  | ||||
| # ===   1.1.4  Confirming PROMLPATH               | ||||
| # Confirm that the settings are right. | ||||
| PROMLPATH                # returns the path | ||||
| list.dirs(PROMLPATH)     # returns the directories in that path | ||||
| list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command" | ||||
|  | ||||
| # If "proml" is NOT among the files that the last command returns, you | ||||
| # can't continue. Ask on the mailing list for advice. | ||||
|  | ||||
| # If everything is good, you can add the line that defines PROMLPATH to | ||||
| # myScripts/.myProfile.R - the path will then be automatically set when | ||||
| # you quit RStudio and return. | ||||
|  | ||||
|  | ||||
| # ==   1.2  Building a maximum likelihood tree  ================================ | ||||
| # Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit, | ||||
| # as a "proseq" object with the read.protein() function of the RPhylip package: | ||||
|  | ||||
| apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa") | ||||
| str(apsIn) | ||||
|  | ||||
| # ... and you are ready to build a tree. | ||||
|  | ||||
| # There are many fast options in PHYLIP - we will use the most _accurate_ one | ||||
| # that it has: proml, a maximum-likelihood tree building program for protein | ||||
| # data. | ||||
|  | ||||
| # Building maximum-likelihood trees can eat as much computer time | ||||
| # as you can throw at it. Calculating a tree of 48 APSES domains | ||||
| # with default parameters of Rproml() runs for more than half a day | ||||
| # on my computer. But we have only twelve sequences here, so the | ||||
| # process will take us about 5 to 15 minutes. Run this, and enjoy a good cup | ||||
| # of coffee while you are waiting. | ||||
|  | ||||
| apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH) | ||||
|  | ||||
| # A quick first look: | ||||
|  | ||||
| plot(apsTree) | ||||
|  | ||||
| # save your tree: | ||||
| saveRDS(apsTree, file = "data/APSEStreeRproml.rds") | ||||
|  | ||||
| # If this did not work, ask for advice. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-PHYLO-Tree_building.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-PHYLO-Tree_building unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10   2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac: | ||||
| #                  instructions to authorize proml.app | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #           1.0    First 2017 version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #           Add MrBayes | ||||
| # https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                       Line | ||||
| #TOC> ----------------------------------------------------------- | ||||
| #TOC>   1        Calculating Trees                             48 | ||||
| #TOC>   1.1        PROMLPATH ...                               68 | ||||
| #TOC>   1.1.1          ... on the Mac                          73 | ||||
| #TOC>   1.1.2          ... on Windows                         101 | ||||
| #TOC>   1.1.3          ... on Linux                           115 | ||||
| #TOC>   1.1.4          Confirming PROMLPATH                   120 | ||||
| #TOC>   1.2        Building a maximum likelihood tree         134 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Calculating Trees  =================================================== | ||||
|  | ||||
|  | ||||
| # Follow the instructions found at phylip's home on the Web to install. If you | ||||
| # are on a Windows computer, take note of the installation directory. | ||||
|  | ||||
| # After you have installed Phylip on your computer, install the R package that | ||||
| # provides an interface to the Phylip functions. | ||||
|  | ||||
| if (! requireNamespace("Rphylip", quietly = TRUE)) { | ||||
|   install.packages("Rphylip") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Rphylip)       # basic information | ||||
| #  browseVignettes("Rphylip")    # available vignettes | ||||
| #  data(package = "Rphylip")     # available datasets | ||||
|  | ||||
| # This will install RPhylip, as well as its dependency, the package "ape". | ||||
|  | ||||
|  | ||||
| # ==   1.1  PROMLPATH ...  ===================================================== | ||||
| # The next part may be tricky. You will need to figure out where | ||||
| # on your computer Phylip has been installed and define the path | ||||
| # to the proml program that calculates a maximum-likelihood tree. | ||||
|  | ||||
| # ===   1.1.1  ... on the Mac                     | ||||
| # On the Mac, the standard installation places a phylip folder | ||||
| # in the /Applications directory. That folder contains all the | ||||
| # individual phylip programs as <name>.app files. These are not | ||||
| # the actual executables, but "app" files are actually directories | ||||
| # that contain the required resources for a program to run. | ||||
|  | ||||
| # The executable is in a subdirectory and you can point Rphylip | ||||
| # directly to that subdirectory to find the program it needs: | ||||
| # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS" | ||||
|  | ||||
| # However, RPHYLIP will not be able to run PHYLIP applications immediately, | ||||
| # because they have not been "signed" by the PHYLIP developers. The process | ||||
| # will be terminated by your system, with a warning. | ||||
|  | ||||
| #   -  Navigate to the phylip folder in your ~/Applications directory | ||||
| #   -  Descend into the "exe" folder and find  proml.app | ||||
| #   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that | ||||
| #      says: "macOS cannot verify the developer of “proml.app”. | ||||
| #             Are you sure you want to open it?" | ||||
| #   -  Click open to continue. You may need to allow access to the terminal | ||||
| #      as well. When the proml terminal session opens, you can type | ||||
| #      Ctrl-c to abort the program and close the window. | ||||
| # | ||||
| #   This adds proml.app to the list of known-good programs and you will not | ||||
| #   need to repeat this process. | ||||
| # | ||||
|  | ||||
| # ===   1.1.2  ... on Windows                     | ||||
| # On Windows you need to know where the programs have been installed, and you | ||||
| # need to specify a path that is correct for the Windows OS. Find the folder | ||||
| # that is named "exe", and right-click to inspect its properties. The path | ||||
| # should be listed among them. | ||||
|  | ||||
| # If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your | ||||
| # assignment has to be | ||||
| # PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe" | ||||
| # (Note: "/", not "\") | ||||
|  | ||||
| # I have heard that your path must not contain spaces, and it is prudent to | ||||
| # avoid other special characters as well. | ||||
|  | ||||
| # ===   1.1.3  ... on Linux                       | ||||
| # If you are running Linux I trust you know what to do. It's probably | ||||
| # something like | ||||
| # PROMLPATH <- "/usr/local/phylip-3.695/bin" | ||||
|  | ||||
| # ===   1.1.4  Confirming PROMLPATH               | ||||
| # Confirm that the settings are right. | ||||
| PROMLPATH                # returns the path | ||||
| list.dirs(PROMLPATH)     # returns the directories in that path | ||||
| list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command" | ||||
|  | ||||
| # If "proml" is NOT among the files that the last command returns, you | ||||
| # can't continue. Ask on the mailing list for advice. | ||||
|  | ||||
| # If everything is good, you can add the line that defines PROMLPATH to | ||||
| # myScripts/.myProfile.R - the path will then be automatically set when | ||||
| # you quit RStudio and return. | ||||
|  | ||||
|  | ||||
| # ==   1.2  Building a maximum likelihood tree  ================================ | ||||
| # Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit, | ||||
| # as a "proseq" object with the read.protein() function of the RPhylip package: | ||||
|  | ||||
| apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa") | ||||
| str(apsIn) | ||||
|  | ||||
| # ... and you are ready to build a tree. | ||||
|  | ||||
| # There are many fast options in PHYLIP - we will use the most _accurate_ one | ||||
| # that it has: proml, a maximum-likelihood tree building program for protein | ||||
| # data. | ||||
|  | ||||
| # Building maximum-likelihood trees can eat as much computer time | ||||
| # as you can throw at it. Calculating a tree of 48 APSES domains | ||||
| # with default parameters of Rproml() runs for more than half a day | ||||
| # on my computer. But we have only twelve sequences here, so the | ||||
| # process will take us about 5 to 15 minutes. Run this, and enjoy a good cup | ||||
| # of coffee while you are waiting. | ||||
|  | ||||
| apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH) | ||||
|  | ||||
| # A quick first look: | ||||
|  | ||||
| plot(apsTree) | ||||
|  | ||||
| # save your tree: | ||||
| saveRDS(apsTree, file = "data/APSEStreeRproml.rds") | ||||
|  | ||||
| # If this did not work, ask for advice. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,323 +1,323 @@ | ||||
| # tocID <- "BIN-PPI-Analysis.R" | ||||
| # | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-PPI-Analysis unit. | ||||
| # | ||||
| # Version:   1.4 | ||||
| # | ||||
| # Date:     2017-08  -  2020-10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.4    Update vector ID's for betweenness centrality. | ||||
| #           1.3    Bugfix: called the wrong function on ENSPsel in l. 220 | ||||
| #           1.2    2020 Updates; Rewrite for new STRINg V11; | ||||
| #                  Deprecate save()/load() for saveRDS()/readRDS() | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0    First live version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                           Line | ||||
| #TOC> --------------------------------------------------------------- | ||||
| #TOC>   1        Setup and data                                    50 | ||||
| #TOC>   2        Functional Edges in the Human Proteome            86 | ||||
| #TOC>   2.1        Cliques                                        129 | ||||
| #TOC>   2.2        Communities                                    170 | ||||
| #TOC>   2.3        Betweenness Centrality                         184 | ||||
| #TOC>   3        biomaRt                                          231 | ||||
| #TOC>   4        Task for submission                              302 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Setup and data  ====================================================== | ||||
|  | ||||
|  | ||||
| # Not surprisingly, the analysis of PPI networks needs iGraph: | ||||
|  | ||||
| if (! requireNamespace("igraph", quietly = TRUE)) { | ||||
|   install.packages("igraph") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = igraph)       # basic information | ||||
| #  browseVignettes("igraph")    # available vignettes | ||||
| #  data(package = "igraph")     # available datasets | ||||
|  | ||||
| # In order for you to explore some real, biological networks, I give you a | ||||
| # dataframe of functional relationships of human proteins that I have downloaded | ||||
| # from the STRING database. The full table has 8.5 million records, here is a | ||||
| # subset of records with combined confidence scores > 980 | ||||
|  | ||||
| # The selected set of edges with a confidence of > 964 is a dataframe with about | ||||
| # 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of | ||||
| # a fungal proteome. You can load the saved dataframe here (To read more about | ||||
| # what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ). | ||||
|  | ||||
| STRINGedges <- readRDS("./data/STRINGedges.rds") | ||||
|  | ||||
| head(STRINGedges) | ||||
|  | ||||
| # Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the | ||||
| # Ensembl protein identifiers that start with ENSP. We'll remove them: | ||||
|  | ||||
| STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a) | ||||
| STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b) | ||||
|  | ||||
| head(STRINGedges) | ||||
|  | ||||
|  | ||||
| # =    2  Functional Edges in the Human Proteome  ============================== | ||||
|  | ||||
|  | ||||
| # There are many possibilities to explore interesting aspects of biological | ||||
| # networks, we will keep with some very simple procedures here but you have | ||||
| # to be aware that this is barely scratching the surface of possibilities. | ||||
| # However, once the network exists in your computer, it is comparatively | ||||
| # easy to find information online about the many, many options to analyze. | ||||
|  | ||||
|  | ||||
| # Make a graph from this dataframe | ||||
| ?igraph::graph_from_data_frame | ||||
|  | ||||
| gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE) | ||||
|  | ||||
| # CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges - | ||||
| # layout of such large graphs is possible, but requires specialized code. Google | ||||
| # for <layout large graphs> if you are curious. Also, consider what one can | ||||
| # really learn from plotting such a graph ... | ||||
|  | ||||
| # Of course simple computations on this graph are reasonably fast: | ||||
|  | ||||
| compSTR <- igraph::components(gSTR) | ||||
| summary(compSTR) # our graph is fully connected! | ||||
|  | ||||
| hist(log(igraph::degree(gSTR)), col="#FEE0AF") | ||||
| # this actually does look rather scale-free | ||||
|  | ||||
| (freqRank <- table(igraph::degree(gSTR))) | ||||
| plot(log10(as.numeric(names(freqRank)) + 1), | ||||
|      log10(as.numeric(freqRank)), type = "b", | ||||
|      pch = 21, bg = "#FEE0AF", | ||||
|      xlab = "log(Rank)", ylab = "log(frequency)", | ||||
|      main = "8,400 nodes from the human functional interaction network") | ||||
|  | ||||
| # This looks very scale-free indeed. | ||||
|  | ||||
| (regressionLine <- lm(log10(as.numeric(freqRank)) ~ | ||||
|                       log10(as.numeric(names(freqRank)) + 1))) | ||||
| abline(regressionLine, col = "firebrick") | ||||
|  | ||||
| # Now explore some more: | ||||
|  | ||||
| # ==   2.1  Cliques  =========================================================== | ||||
|  | ||||
| # Let's find the largest cliques. Remember: a clique is a fully connected | ||||
| # subgraph, i.e. a subgraph in which every node is connected to every other. | ||||
| # Biological complexes often appear as cliques in interaction graphs. | ||||
|  | ||||
| igraph::clique_num(gSTR) | ||||
| # The largest clique has 81 members. | ||||
|  | ||||
| (C <- igraph::largest_cliques(gSTR)[[1]]) | ||||
|  | ||||
| # Pick one of the proteins and find out what this fully connected cluster of 81 | ||||
| # proteins is (you can simply Google for any of the IDs). Is this expected? | ||||
|  | ||||
| # Plot this ... | ||||
| R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices | ||||
|  | ||||
| # color the vertices along a color spectrum | ||||
| vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes | ||||
|  | ||||
| # color the edges to have the same color as the originating node: repeat | ||||
| # each vertex color gorder(R) times, in vertex order. rep(..., each = ) builds | ||||
| # the whole vector in one step instead of growing it inside a loop. | ||||
| eCol <- rep(vCol, each = igraph::gorder(R)) | ||||
|  | ||||
| oPar <- par(mar= rep(0,4)) # Turn margins off | ||||
| plot(R, | ||||
|      layout = igraph::layout_in_circle(R), | ||||
|      vertex.size = 3, | ||||
|      vertex.color = vCol, | ||||
|      edge.color = eCol, | ||||
|      edge.width = 0.1, | ||||
|      vertex.label = NA) | ||||
| par(oPar) | ||||
|  | ||||
| # ... well: remember: a clique means every node is connected to every other | ||||
| # node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI | ||||
| # networks looks like for large complexes. | ||||
|  | ||||
|  | ||||
| # ==   2.2  Communities  ======================================================= | ||||
|  | ||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | ||||
| gSTRclusters <- igraph::cluster_infomap(gSTR) | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| igraph::modularity(gSTRclusters) # ... measures how separated the different | ||||
|                                  # membership types are from each other | ||||
| tMem <- table(igraph::membership(gSTRclusters)) | ||||
| length(tMem)  # About 700 communities identified | ||||
| hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ... | ||||
| range(tMem) # ... but one has > 200 members | ||||
|  | ||||
|  | ||||
| # ==   2.3  Betweenness Centrality  ============================================ | ||||
|  | ||||
| # Let's find the nodes with the 10 - highest betweenness centralities. | ||||
| # | ||||
| BC <- igraph::centr_betw(gSTR) | ||||
|  | ||||
| # remember: BC$res contains the results | ||||
| head(BC$res) | ||||
|  | ||||
| BC$res[1]   # betweenness centrality of node 1 in the graph ... | ||||
| # ... which one is node 1? | ||||
| igraph::V(gSTR)[1] | ||||
|  | ||||
| # to get the ten-highest nodes, we simply label the elements of BC$res with | ||||
| # their index (seq_along() is safe even if the vector is empty) ... | ||||
| names(BC$res) <- as.character(seq_along(BC$res)) | ||||
|  | ||||
| # ... and then we sort: | ||||
| sBC <- sort(BC$res, decreasing = TRUE) | ||||
| head(sBC) | ||||
|  | ||||
| # This ordered vector means: node 3 has the highest betweenness centrality, | ||||
| # node 721 has the second highest, etc. | ||||
|  | ||||
| (BCsel <- as.numeric(names(sBC)[1:10])) | ||||
|  | ||||
| # We can use the first ten labels to subset the nodes in gSTR and fetch the | ||||
| # IDs... | ||||
| (ENSPsel <- names(igraph::V(gSTR)[BCsel])) | ||||
|  | ||||
| # Task: | ||||
| # ===== | ||||
| # IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT | ||||
| # We are going to use these IDs to produce some output for a submitted task: | ||||
| # therefore I need you to execute the following line, note the "seal" that this | ||||
| # returns, and not change myENSPsel later: | ||||
|  | ||||
| myENSPsel <- selectENSP(ENSPsel) | ||||
|  | ||||
| #  Next, to find what these proteins are... | ||||
|  | ||||
| # We could now Google for all of these IDs to learn more about them. But really, | ||||
| # googling for IDs one after the other, that would be lame. Let's instead use | ||||
| # the very, very useful biomaRt package to translate these Ensembl IDs into | ||||
| # gene symbols. | ||||
|  | ||||
|  | ||||
| # =    3  biomaRt  ============================================================= | ||||
|  | ||||
|  | ||||
| # IDs are just labels, but for _bio_informatics we need to learn more about the | ||||
| # biological function of the genes or proteins that we retrieve via graph data | ||||
| # mining. biomaRt is the tool of choice. It's a package distributed by the | ||||
| # bioconductor project. This here is not a biomaRt tutorial (that's for another | ||||
| # day), simply a few lines of sample code to get you started on the specific use | ||||
| # case of retrieving descriptions for ensembl protein IDs. | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("biomaRt", quietly = TRUE)) { | ||||
|   BiocManager::install("biomaRt") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = biomaRt)       # basic information | ||||
| #  browseVignettes("biomaRt")    # available vignettes | ||||
| #  data(package = "biomaRt")     # available datasets | ||||
|  | ||||
| # define which dataset to use ... this takes a while for download | ||||
| myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl") | ||||
|  | ||||
| # what filters are defined? | ||||
| ( filters <- biomaRt::listFilters(myMart) ) | ||||
|  | ||||
|  | ||||
| # and what attributes can we filter for? | ||||
| ( attributes <- biomaRt::listAttributes(myMart) ) | ||||
|  | ||||
|  | ||||
| # Soooo many options - let's look for the correct name of filters that are | ||||
| # useful for ENSP IDs ... | ||||
| filters[grep("ENSP", filters$description), ] | ||||
|  | ||||
| # ... and the correct attribute names for gene symbols and descriptions ... | ||||
| attributes[grep("symbol", attributes$description, ignore.case = TRUE), ] | ||||
| attributes[grep("description", attributes$description, ignore.case = TRUE), ] | ||||
|  | ||||
|  | ||||
| # ... so we can put this together: here is a syntax example: | ||||
| biomaRt::getBM(filters = "ensembl_peptide_id", | ||||
|                attributes = c("hgnc_symbol", | ||||
|                               "wikigene_description", | ||||
|                               "interpro_description", | ||||
|                               "phenotype_description"), | ||||
|                values = "ENSP00000000442", | ||||
|                mart = myMart) | ||||
|  | ||||
| # A simple loop will now get us the information for our 10 most central genes | ||||
| # from the human subset of STRING. | ||||
|  | ||||
| CPdefs <- list()  # Since we don't know how many matches one of our queries | ||||
| # will return, we'll put the result dataframes into a list. | ||||
|  | ||||
| for (ID in myENSPsel) { | ||||
|   CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id", | ||||
|                                  attributes = c("hgnc_symbol", | ||||
|                                                 "wikigene_description", | ||||
|                                                 "interpro_description", | ||||
|                                                 "phenotype_description"), | ||||
|                                  values = ID, | ||||
|                                  mart = myMart) | ||||
| } | ||||
|  | ||||
|  | ||||
| # So what are the proteins with the ten highest betweenness centralities? | ||||
| #  ... are you surprised? (I am! Really.) | ||||
|  | ||||
|  | ||||
| # =    4  Task for submission  ================================================= | ||||
|  | ||||
| # Write a loop that will go through your personalized list of Ensembl IDs and | ||||
| #    for each ID: | ||||
| #    --  print the ID, | ||||
| #    --  print the first row's HGNC symbol, | ||||
| #    --  print the first row's wikigene description. | ||||
| #    --  print the first row's phenotype. | ||||
| # | ||||
| # Write your thoughts about this group of genes. | ||||
| # | ||||
| # (Hint, you can structure your loop in the same way as the loop that | ||||
| # created CPdefs. ) | ||||
|  | ||||
| # Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code | ||||
| # for this loop and its output into your report if you are submitting | ||||
| # anything for credit for this unit. Please read the requirements carefully. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-PPI-Analysis.R" | ||||
| # | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-PPI-Analysis unit. | ||||
| # | ||||
| # Version:   1.4 | ||||
| # | ||||
| # Date:     2017-08  -  2020-10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.4    Update vector ID's for betweenness centrality. | ||||
| #           1.3    Bugfix: called the wrong function on ENSPsel in l. 220 | ||||
| #           1.2    2020 Updates; Rewrite for new STRING V11; | ||||
| #                  Deprecate save()/load() for saveRDS()/readRDS() | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| #           1.0    First live version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                           Line | ||||
| #TOC> --------------------------------------------------------------- | ||||
| #TOC>   1        Setup and data                                    50 | ||||
| #TOC>   2        Functional Edges in the Human Proteome            86 | ||||
| #TOC>   2.1        Cliques                                        129 | ||||
| #TOC>   2.2        Communities                                    170 | ||||
| #TOC>   2.3        Betweenness Centrality                         184 | ||||
| #TOC>   3        biomaRt                                          231 | ||||
| #TOC>   4        Task for submission                              302 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Setup and data  ====================================================== | ||||
|  | ||||
|  | ||||
| # Not surprisingly, the analysis of PPI networks needs iGraph: | ||||
|  | ||||
| if (! requireNamespace("igraph", quietly = TRUE)) { | ||||
|   install.packages("igraph") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = igraph)       # basic information | ||||
| #  browseVignettes("igraph")    # available vignettes | ||||
| #  data(package = "igraph")     # available datasets | ||||
|  | ||||
| # In order for you to explore some real, biological networks, I give you a | ||||
| # dataframe of functional relationships of human proteins that I have downloaded | ||||
| # from the STRING database. The full table has 8.5 million records, here is a | ||||
| # subset of records with combined confidence scores > 980 | ||||
|  | ||||
| # The selected set of edges with a confidence of > 964 is a dataframe with about | ||||
| # 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of | ||||
| # a fungal proteome. You can load the saved dataframe here (To read more about | ||||
| # what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ). | ||||
|  | ||||
| STRINGedges <- readRDS("./data/STRINGedges.rds") | ||||
|  | ||||
| head(STRINGedges) | ||||
|  | ||||
| # Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the | ||||
| # Ensembl protein identifiers that start with ENSP. We'll remove them: | ||||
|  | ||||
| STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a) | ||||
| STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b) | ||||
|  | ||||
| head(STRINGedges) | ||||
|  | ||||
|  | ||||
| # =    2  Functional Edges in the Human Proteome  ============================== | ||||
|  | ||||
|  | ||||
| # There are many possibilities to explore interesting aspects of biological | ||||
| # networks, we will keep with some very simple procedures here but you have | ||||
| # to be aware that this is barely scratching the surface of possibilities. | ||||
| # However, once the network exists in your computer, it is comparatively | ||||
| # easy to find information online about the many, many options to analyze. | ||||
|  | ||||
|  | ||||
| # Make a graph from this dataframe | ||||
| ?igraph::graph_from_data_frame | ||||
|  | ||||
| gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE) | ||||
|  | ||||
| # CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges - | ||||
| # layout of such large graphs is possible, but requires specialized code. Google | ||||
| # for <layout large graphs> if you are curious. Also, consider what one can | ||||
| # really learn from plotting such a graph ... | ||||
|  | ||||
| # Of course simple computations on this graph are reasonably fast: | ||||
|  | ||||
| compSTR <- igraph::components(gSTR) | ||||
| summary(compSTR) # our graph is fully connected! | ||||
|  | ||||
| hist(log(igraph::degree(gSTR)), col="#FEE0AF") | ||||
| # this actually does look rather scale-free | ||||
|  | ||||
| (freqRank <- table(igraph::degree(gSTR))) | ||||
| plot(log10(as.numeric(names(freqRank)) + 1), | ||||
|      log10(as.numeric(freqRank)), type = "b", | ||||
|      pch = 21, bg = "#FEE0AF", | ||||
|      xlab = "log(Rank)", ylab = "log(frequency)", | ||||
|      main = "8,400 nodes from the human functional interaction network") | ||||
|  | ||||
| # This looks very scale-free indeed. | ||||
|  | ||||
| (regressionLine <- lm(log10(as.numeric(freqRank)) ~ | ||||
|                       log10(as.numeric(names(freqRank)) + 1))) | ||||
| abline(regressionLine, col = "firebrick") | ||||
|  | ||||
| # Now explore some more: | ||||
|  | ||||
| # ==   2.1  Cliques  =========================================================== | ||||
|  | ||||
| # Let's find the largest cliques. Remember: a clique is a fully connected | ||||
| # subgraph, i.e. a subgraph in which every node is connected to every other. | ||||
| # Biological complexes often appear as cliques in interaction graphs. | ||||
|  | ||||
| igraph::clique_num(gSTR) | ||||
| # The largest clique has 81 members. | ||||
|  | ||||
| (C <- igraph::largest_cliques(gSTR)[[1]]) | ||||
|  | ||||
| # Pick one of the proteins and find out what this fully connected cluster of 81 | ||||
| # proteins is (you can simply Google for any of the IDs). Is this expected? | ||||
|  | ||||
| # Plot this ... | ||||
| R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices | ||||
|  | ||||
| # color the vertices along a color spectrum | ||||
| vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes | ||||
|  | ||||
| # color the edges to have the same color as the originating node: repeat | ||||
| # each vertex color gorder(R) times, in vertex order. rep(..., each = ) builds | ||||
| # the whole vector in one step instead of growing it inside a loop. | ||||
| eCol <- rep(vCol, each = igraph::gorder(R)) | ||||
|  | ||||
| oPar <- par(mar= rep(0,4)) # Turn margins off | ||||
| plot(R, | ||||
|      layout = igraph::layout_in_circle(R), | ||||
|      vertex.size = 3, | ||||
|      vertex.color = vCol, | ||||
|      edge.color = eCol, | ||||
|      edge.width = 0.1, | ||||
|      vertex.label = NA) | ||||
| par(oPar) | ||||
|  | ||||
| # ... well: remember: a clique means every node is connected to every other | ||||
| # node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI | ||||
| # networks looks like for large complexes. | ||||
|  | ||||
|  | ||||
| # ==   2.2  Communities  ======================================================= | ||||
|  | ||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | ||||
| gSTRclusters <- igraph::cluster_infomap(gSTR) | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| igraph::modularity(gSTRclusters) # ... measures how separated the different | ||||
|                                  # membership types are from each other | ||||
| tMem <- table(igraph::membership(gSTRclusters)) | ||||
| length(tMem)  # About 700 communities identified | ||||
| hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ... | ||||
| range(tMem) # ... but one has > 200 members | ||||
|  | ||||
|  | ||||
| # ==   2.3  Betweenness Centrality  ============================================ | ||||
|  | ||||
| # Let's find the nodes with the 10 - highest betweenness centralities. | ||||
| # | ||||
| BC <- igraph::centr_betw(gSTR) | ||||
|  | ||||
| # remember: BC$res contains the results | ||||
| head(BC$res) | ||||
|  | ||||
| BC$res[1]   # betweenness centrality of node 1 in the graph ... | ||||
| # ... which one is node 1? | ||||
| igraph::V(gSTR)[1] | ||||
|  | ||||
| # to get the ten-highest nodes, we simply label the elements of BC$res with | ||||
| # their index (seq_along() is safe even if the vector is empty) ... | ||||
| names(BC$res) <- as.character(seq_along(BC$res)) | ||||
|  | ||||
| # ... and then we sort: | ||||
| sBC <- sort(BC$res, decreasing = TRUE) | ||||
| head(sBC) | ||||
|  | ||||
| # This ordered vector means: node 3 has the highest betweenness centrality, | ||||
| # node 721 has the second highest, etc. | ||||
|  | ||||
| (BCsel <- as.numeric(names(sBC)[1:10])) | ||||
|  | ||||
| # We can use the first ten labels to subset the nodes in gSTR and fetch the | ||||
| # IDs... | ||||
| (ENSPsel <- names(igraph::V(gSTR)[BCsel])) | ||||
|  | ||||
| # Task: | ||||
| # ===== | ||||
| # IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT | ||||
| # We are going to use these IDs to produce some output for a submitted task: | ||||
| # therefore I need you to execute the following line, note the "seal" that this | ||||
| # returns, and not change myENSPsel later: | ||||
|  | ||||
| myENSPsel <- selectENSP(ENSPsel) | ||||
|  | ||||
| #  Next, to find what these proteins are... | ||||
|  | ||||
| # We could now Google for all of these IDs to learn more about them. But really, | ||||
| # googling for IDs one after the other, that would be lame. Let's instead use | ||||
| # the very, very useful biomaRt package to translate these Ensembl IDs into | ||||
| # gene symbols. | ||||
|  | ||||
|  | ||||
| # =    3  biomaRt  ============================================================= | ||||
|  | ||||
|  | ||||
| # IDs are just labels, but for _bio_informatics we need to learn more about the | ||||
| # biological function of the genes or proteins that we retrieve via graph data | ||||
| # mining. biomaRt is the tool of choice. It's a package distributed by the | ||||
| # bioconductor project. This here is not a biomaRt tutorial (that's for another | ||||
| # day), simply a few lines of sample code to get you started on the specific use | ||||
| # case of retrieving descriptions for ensembl protein IDs. | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("biomaRt", quietly = TRUE)) { | ||||
|   BiocManager::install("biomaRt") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = biomaRt)       # basic information | ||||
| #  browseVignettes("biomaRt")    # available vignettes | ||||
| #  data(package = "biomaRt")     # available datasets | ||||
|  | ||||
| # define which dataset to use ... this takes a while for download | ||||
| myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl") | ||||
|  | ||||
| # what filters are defined? | ||||
| ( filters <- biomaRt::listFilters(myMart) ) | ||||
|  | ||||
|  | ||||
| # and what attributes can we filter for? | ||||
| ( attributes <- biomaRt::listAttributes(myMart) ) | ||||
|  | ||||
|  | ||||
| # Soooo many options - let's look for the correct name of filters that are | ||||
| # useful for ENSP IDs ... | ||||
| filters[grep("ENSP", filters$description), ] | ||||
|  | ||||
| # ... and the correct attribute names for gene symbols and descriptions ... | ||||
| attributes[grep("symbol", attributes$description, ignore.case = TRUE), ] | ||||
| attributes[grep("description", attributes$description, ignore.case = TRUE), ] | ||||
|  | ||||
|  | ||||
| # ... so we can put this together: here is a syntax example: | ||||
| biomaRt::getBM(filters = "ensembl_peptide_id", | ||||
|                attributes = c("hgnc_symbol", | ||||
|                               "wikigene_description", | ||||
|                               "interpro_description", | ||||
|                               "phenotype_description"), | ||||
|                values = "ENSP00000000442", | ||||
|                mart = myMart) | ||||
|  | ||||
| # A simple loop will now get us the information for our 10 most central genes | ||||
| # from the human subset of STRING. | ||||
|  | ||||
| CPdefs <- list()  # Since we don't know how many matches one of our queries | ||||
| # will return, we'll put the result dataframes into a list. | ||||
|  | ||||
| for (ID in myENSPsel) { | ||||
|   CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id", | ||||
|                                  attributes = c("hgnc_symbol", | ||||
|                                                 "wikigene_description", | ||||
|                                                 "interpro_description", | ||||
|                                                 "phenotype_description"), | ||||
|                                  values = ID, | ||||
|                                  mart = myMart) | ||||
| } | ||||
|  | ||||
|  | ||||
| # So what are the proteins with the ten highest betweenness centralities? | ||||
| #  ... are you surprised? (I am! Really.) | ||||
|  | ||||
|  | ||||
| # =    4  Task for submission  ================================================= | ||||
|  | ||||
| # Write a loop that will go through your personalized list of Ensembl IDs and | ||||
| #    for each ID: | ||||
| #    --  print the ID, | ||||
| #    --  print the first row's HGNC symbol, | ||||
| #    --  print the first row's wikigene description. | ||||
| #    --  print the first row's phenotype. | ||||
| # | ||||
| # Write your thoughts about this group of genes. | ||||
| # | ||||
| # (Hint, you can structure your loop in the same way as the loop that | ||||
| # created CPdefs. ) | ||||
|  | ||||
| # Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code | ||||
| # for this loop and its output into your report if you are submitting | ||||
| # anything for credit for this unit. Please read the requirements carefully. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,252 +1,252 @@ | ||||
| # tocID <- "BIN-SEQA-Composition.R" | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-SEQA-Composition unit | ||||
| # | ||||
| # Version: 1.2 | ||||
| # | ||||
| # Date:    2017-11  -  2020-09 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| # Versions: | ||||
| #           1.0    First live version 2017 | ||||
| #           0.1    First code copied from BCH441_A03_makeYFOlist.R | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # DO NOT SIMPLY  source()  THESE FILES! | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                      Line | ||||
| #TOC> ---------------------------------------------------------- | ||||
| #TOC>   1        Preparation                                  48 | ||||
| #TOC>   2        Aggregate properties                         69 | ||||
| #TOC>   3        Sequence Composition Enrichment             113 | ||||
| #TOC>   3.1        Barplot, and side-by-side barplot         136 | ||||
| #TOC>   3.2        Plotting ratios                           171 | ||||
| #TOC>   3.3        Plotting log ratios                       188 | ||||
| #TOC>   3.4        Sort by frequency                         204 | ||||
| #TOC>   3.5        Color by amino acid type                  221 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Preparation  ========================================================= | ||||
|  | ||||
| if (! requireNamespace("seqinr", quietly = TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = seqinr)       # basic information | ||||
| #  browseVignettes("seqinr")    # available vignettes | ||||
| #  data(package = "seqinr")     # available datasets | ||||
|  | ||||
| # Load a reference sequence to work with: | ||||
|  | ||||
| # If you have done the BIN-Storing_data unit: | ||||
|    source("makeProteinDB.R") | ||||
|    sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE))) | ||||
|    mySeq <- myDB$protein$sequence[sel] | ||||
|  | ||||
| # If not, use the yeast Mbp1 sequence: | ||||
|    mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence) | ||||
|  | ||||
|  | ||||
| # =    2  Aggregate properties  ================================================ | ||||
|  | ||||
|  | ||||
| # Let's try a simple function from seqinr: computing the pI of the sequence | ||||
| ?seqinr::computePI | ||||
|  | ||||
| # This takes as input a vector of upper-case AA codes | ||||
|  | ||||
| # We can use the function strsplit() to split the string | ||||
| # into single characters | ||||
|  | ||||
| (s <- strsplit(mySeq, "")) # splitting on the empty string | ||||
|                            # splits into single characters | ||||
| s <- unlist(s)             # strsplit() returns a list! Why? | ||||
|                            # (But we don't need a list now...) | ||||
|  | ||||
| # Alternatively, seqinr provides | ||||
| # the function s2c() to convert strings into | ||||
| # character vectors (and c2s to convert them back). | ||||
|  | ||||
| seqinr::s2c(mySeq) | ||||
|  | ||||
|  | ||||
| seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point | ||||
| seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight | ||||
| seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of | ||||
|                                        # values along the sequence | ||||
|  | ||||
| # A true Labor of Love has gone into the | ||||
| # compilation of the "aaindex" data: | ||||
|  | ||||
| ?seqinr::aaindex | ||||
| data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it | ||||
|                                    # accessible as an R object | ||||
|  | ||||
| length(aaindex)  # no seqinr:: needed for the dataset since we just | ||||
|                  # "attached" it with data() | ||||
|  | ||||
| # Here are all the index descriptions: print one line per index in the form | ||||
| # "<position>: <description>". seq_along() avoids the 1:0 trap of 1:length(). | ||||
| for (i in seq_along(aaindex)) { | ||||
|   cat(paste0(i, ": ", aaindex[[i]]$D, "\n")) | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    3  Sequence Composition Enrichment  ===================================== | ||||
|  | ||||
|  | ||||
| # Let's use one of the indices to calculate and plot amino-acid | ||||
| # composition enrichment: | ||||
| aaindex[[459]]$D | ||||
|  | ||||
| # | ||||
| # Let's construct an enrichment plot to compare average frequencies | ||||
| # with the amino acid counts in our sequence. | ||||
|  | ||||
| (refData <- aaindex[[459]]$I)                # reference frequencies in % | ||||
| names(refData) <- seqinr::a(names(refData))  # change names to single-letter | ||||
|                                              # code using seqinr's "a()" function | ||||
| sum(refData) | ||||
| refData        # ... in % | ||||
|  | ||||
|  | ||||
| # tabulate the amino acid counts in mySeq | ||||
| (obsData <- table(seqinr::s2c(mySeq)))        # counts | ||||
| (obsData <- 100 * (obsData / sum(obsData)))   # frequencies | ||||
|  | ||||
|  | ||||
| # ==   3.1  Barplot, and side-by-side barplot  ================================= | ||||
|  | ||||
| barplot(obsData, col = "#CCCCCC", cex.names = 0.7) | ||||
| abline(h = 100/20, col="#BB0000") | ||||
|  | ||||
| barplot(refData, col = "#BB0000", cex.names = 0.7) | ||||
| abline(h = 100/20, col="#555555") | ||||
|  | ||||
| # Ok: first problem - the values in obsData are in alphabetical order. But the | ||||
| # values in refData are in alphabetical order of amino acid name: alanine, | ||||
| # arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this | ||||
| # order a lot - one of the old biochemistry tropes in the field. So we need to | ||||
| # re-order one of the vectors to match the other. That's easy though: | ||||
| refData | ||||
| (refData <- refData[names(obsData)]) | ||||
|  | ||||
| barplot(refData, col = "#BB0000", cex.names = 0.7) | ||||
| abline(h = 100/20, col="#555555") | ||||
|  | ||||
| # To compare the values, we want to see them in a barplot, side-by-side ... | ||||
| barplot(rbind(obsData, refData), | ||||
|         ylim = c(0, 12), | ||||
|         beside = TRUE, | ||||
|         col = c("#CCCCCC", "#BB0000"), | ||||
|         cex.names = 0.7) | ||||
| abline(h = 100/20, col="#00000044") | ||||
|  | ||||
| # ... and add a legend | ||||
| legend (x = 1, y = 12, | ||||
|         legend = c("mySeq", "Average composition"), | ||||
|         fill = c("#CCCCCC", "#BB0000"), | ||||
|         cex = 0.7, | ||||
|         bty = "n") | ||||
|  | ||||
|  | ||||
| # ==   3.2  Plotting ratios  =================================================== | ||||
|  | ||||
| # To better compare the values, we'll calculate ratios between | ||||
| # obsData and refData | ||||
|  | ||||
| barplot(obsData / refData, | ||||
|         col = "#CCCCCC", | ||||
|         ylab = "Sequence / Average", | ||||
|         ylim = c(0, 2.5), | ||||
|         cex.names = 0.7) | ||||
| abline(h = 1, col="#BB0000") | ||||
| abline(h = c(1/2, 2), lty = 2, col="#BB000055") | ||||
|  | ||||
| # ... but  ratios are not very good here, since the difference in height on the | ||||
| # plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted | ||||
| # lines) are exactly the same fold-difference ! | ||||
|  | ||||
| # ==   3.3  Plotting log ratios  =============================================== | ||||
|  | ||||
| # A better way to display this | ||||
| # is to plot log(ratios). | ||||
|  | ||||
| barplot(log(obsData / refData), | ||||
|         col = "#CCCCCC", | ||||
|         ylab = "log(Sequence / Average)", | ||||
|         ylim = log(c(1/3, 3)), | ||||
|         cex.names = 0.7) | ||||
| abline(h = log(1), col="#BB0000") | ||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") | ||||
|  | ||||
| # Note how the two-fold difference lines are now the same distance from the | ||||
| # line of equal ratio. | ||||
|  | ||||
| # ==   3.4  Sort by frequency  ================================================= | ||||
|  | ||||
| barplot(sort(log(obsData / refData), decreasing = TRUE), | ||||
|         ylim = log(c(1/3, 3)), | ||||
|         col = "#CCCCCC", | ||||
|         ylab = "log(Sequence / Average)", | ||||
|         cex.names = 0.7) | ||||
| abline(h = log(1), col="#BB0000") | ||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") | ||||
|  | ||||
| yTxt <- log(0.9) | ||||
| arrows(4, yTxt, 0, yTxt, length = 0.07) | ||||
| text(5.5, yTxt, "Enriched", cex = 0.7) | ||||
| yTxt <- log(1.1) | ||||
| arrows(20, yTxt, 24, yTxt, length = 0.07) | ||||
| text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) | ||||
|  | ||||
| # ==   3.5  Color by amino acid type  ========================================== | ||||
|  | ||||
| # Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R | ||||
| # script, or define your own. | ||||
|  | ||||
| barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5) | ||||
|  | ||||
| lR <- sort(log(obsData / refData), decreasing = TRUE) | ||||
| barplot(lR, | ||||
|         ylim = log(c(1/3, 3)), | ||||
|         col = AACOLS[names(lR)], | ||||
|         ylab = "log(Sequence / Average)", | ||||
|         cex.names = 0.7) | ||||
| abline(h = log(1), col="#00000055") | ||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#00000033") | ||||
|  | ||||
| yTxt <- log(0.9) | ||||
| arrows(4, yTxt, 0, yTxt, length = 0.07) | ||||
| text(5.5, yTxt, "Enriched", cex = 0.7) | ||||
| yTxt <- log(1.1) | ||||
| arrows(20, yTxt, 24, yTxt, length = 0.07) | ||||
| text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) | ||||
|  | ||||
|  | ||||
| # Task: | ||||
| #   Interpret this plot. (Can you?) Which types of amino acids are enriched? | ||||
| #   Depleted? | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-SEQA-Composition.R" | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-SEQA-Composition unit | ||||
| # | ||||
| # Version: 1.2 | ||||
| # | ||||
| # Date:    2017-11  -  2020-09 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| #           1.0    First live version 2017 | ||||
| #           0.1    First code copied from BCH441_A03_makeYFOlist.R | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # DO NOT SIMPLY  source()  THESE FILES! | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                      Line | ||||
| #TOC> ---------------------------------------------------------- | ||||
| #TOC>   1        Preparation                                  48 | ||||
| #TOC>   2        Aggregate properties                         69 | ||||
| #TOC>   3        Sequence Composition Enrichment             113 | ||||
| #TOC>   3.1        Barplot, and side-by-side barplot         136 | ||||
| #TOC>   3.2        Plotting ratios                           171 | ||||
| #TOC>   3.3        Plotting log ratios                       188 | ||||
| #TOC>   3.4        Sort by frequency                         204 | ||||
| #TOC>   3.5        Color by amino acid type                  221 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Preparation  ========================================================= | ||||
|  | ||||
| if (! requireNamespace("seqinr", quietly = TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = seqinr)       # basic information | ||||
| #  browseVignettes("seqinr")    # available vignettes | ||||
| #  data(package = "seqinr")     # available datasets | ||||
|  | ||||
| # Load a reference sequence to work with. Run ONE of the two alternatives: | ||||
|  | ||||
| # If you have done the BIN-Storing_data unit: | ||||
|    source("makeProteinDB.R") | ||||
|    sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE))) | ||||
|    mySeq <- myDB$protein$sequence[sel] | ||||
|  | ||||
| # If not, use the yeast Mbp1 sequence (this overwrites mySeq from above): | ||||
|    mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence) | ||||
|  | ||||
|  | ||||
| # =    2  Aggregate properties  ================================================ | ||||
|  | ||||
|  | ||||
| # Let's try a simple function from seqinr: computing the pI of the sequence | ||||
| ?seqinr::computePI | ||||
|  | ||||
| # This takes as input a vector of upper-case AA codes | ||||
|  | ||||
| # We can use the function strsplit() to split the string | ||||
| # into single characters | ||||
|  | ||||
| (s <- strsplit(mySeq, "")) # splitting on the empty string | ||||
|                            # splits into single characters | ||||
| s <- unlist(s)             # strsplit() returns a list! Why? | ||||
|                            # (But we don't need a list now...) | ||||
|  | ||||
| # Alternatively, seqinr provides | ||||
| # the function s2c() to convert strings into | ||||
| # character vectors (and c2s to convert them back). | ||||
|  | ||||
| seqinr::s2c(mySeq) | ||||
|  | ||||
|  | ||||
| seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point | ||||
| seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight | ||||
| seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of | ||||
|                                        # values along the sequence | ||||
|  | ||||
| # A true Labor of Love has gone into the | ||||
| # compilation of the "aaindex" data: | ||||
|  | ||||
| ?seqinr::aaindex | ||||
| data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it | ||||
|                                    # accessible as an R object | ||||
|  | ||||
| length(aaindex)  # no seqinr:: needed for the dataset since we just | ||||
|                  # "attached" it with data() | ||||
|  | ||||
| # Print every index description ($D), prefixed with its list position | ||||
| for (i in seq_along(aaindex)) {  # seq_along() also handles length 0 | ||||
|   cat(paste0(i, ": ", aaindex[[i]]$D, "\n")) | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    3  Sequence Composition Enrichment  ===================================== | ||||
|  | ||||
|  | ||||
| # Let's use one of the indices to calculate and plot amino-acid | ||||
| # composition enrichment: | ||||
| aaindex[[459]]$D | ||||
|  | ||||
| # | ||||
| # Let's construct an enrichment plot to compare average frequencies | ||||
| # with the amino acid counts in our sequence. | ||||
|  | ||||
| (refData <- aaindex[[459]]$I)                # reference frequencies in % | ||||
| names(refData) <- seqinr::a(names(refData))  # change names to single-letter | ||||
|                                              # code using seqinr's "a()" function | ||||
| sum(refData) | ||||
| refData        # ... in % | ||||
|  | ||||
|  | ||||
| # tabulate the amino acid counts in mySeq | ||||
| (obsData <- table(seqinr::s2c(mySeq)))        # counts | ||||
| (obsData <- 100 * (obsData / sum(obsData)))   # frequencies | ||||
|  | ||||
|  | ||||
| # ==   3.1  Barplot, and side-by-side barplot  ================================= | ||||
|  | ||||
| barplot(obsData, col = "#CCCCCC", cex.names = 0.7) | ||||
| abline(h = 100/20, col="#BB0000") | ||||
|  | ||||
| barplot(refData, col = "#BB0000", cex.names = 0.7) | ||||
| abline(h = 100/20, col="#555555") | ||||
|  | ||||
| # Ok: first problem - the values in obsData are in alphabetical order. But the | ||||
| # values in refData are in alphabetical order of amino acid name: alanine, | ||||
| # arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this | ||||
| # order a lot - one of the old biochemistry tropes in the field. So we need to | ||||
| # re-order one of the vectors to match the other. That's easy though: | ||||
| refData | ||||
| (refData <- refData[names(obsData)]) | ||||
|  | ||||
| barplot(refData, col = "#BB0000", cex.names = 0.7) | ||||
| abline(h = 100/20, col="#555555") | ||||
|  | ||||
| # To compare the values, we want to see them in a barplot, side-by-side ... | ||||
| barplot(rbind(obsData, refData), | ||||
|         ylim = c(0, 12), | ||||
|         beside = TRUE, | ||||
|         col = c("#CCCCCC", "#BB0000"), | ||||
|         cex.names = 0.7) | ||||
| abline(h = 100/20, col="#00000044") | ||||
|  | ||||
| # ... and add a legend | ||||
| legend (x = 1, y = 12, | ||||
|         legend = c("mySeq", "Average composition"), | ||||
|         fill = c("#CCCCCC", "#BB0000"), | ||||
|         cex = 0.7, | ||||
|         bty = "n") | ||||
|  | ||||
|  | ||||
| # ==   3.2  Plotting ratios  =================================================== | ||||
|  | ||||
| # To better compare the values, we'll calculate ratios between | ||||
| # obsData and refData | ||||
|  | ||||
| barplot(obsData / refData, | ||||
|         col = "#CCCCCC", | ||||
|         ylab = "Sequence / Average", | ||||
|         ylim = c(0, 2.5), | ||||
|         cex.names = 0.7) | ||||
| abline(h = 1, col="#BB0000") | ||||
| abline(h = c(1/2, 2), lty = 2, col="#BB000055") | ||||
|  | ||||
| # ... but  ratios are not very good here, since the difference in height on the | ||||
| # plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted | ||||
| # lines) are exactly the same fold-difference ! | ||||
|  | ||||
| # ==   3.3  Plotting log ratios  =============================================== | ||||
|  | ||||
| # A better way to display this | ||||
| # is to plot log(ratios). | ||||
|  | ||||
| barplot(log(obsData / refData), | ||||
|         col = "#CCCCCC", | ||||
|         ylab = "log(Sequence / Average)", | ||||
|         ylim = log(c(1/3, 3)), | ||||
|         cex.names = 0.7) | ||||
| abline(h = log(1), col="#BB0000") | ||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") | ||||
|  | ||||
| # Note how the two-fold difference lines are now the same distance from the | ||||
| # line of equal ratio. | ||||
|  | ||||
| # ==   3.4  Sort by frequency  ================================================= | ||||
|  | ||||
| barplot(sort(log(obsData / refData), decreasing = TRUE), | ||||
|         ylim = log(c(1/3, 3)), | ||||
|         col = "#CCCCCC", | ||||
|         ylab = "log(Sequence / Average)", | ||||
|         cex.names = 0.7) | ||||
| abline(h = log(1), col="#BB0000") | ||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") | ||||
|  | ||||
| yTxt <- log(0.9) | ||||
| arrows(4, yTxt, 0, yTxt, length = 0.07) | ||||
| text(5.5, yTxt, "Enriched", cex = 0.7) | ||||
| yTxt <- log(1.1) | ||||
| arrows(20, yTxt, 24, yTxt, length = 0.07) | ||||
| text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) | ||||
|  | ||||
| # ==   3.5  Color by amino acid type  ========================================== | ||||
|  | ||||
| # Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R | ||||
| # script, or define your own. | ||||
|  | ||||
| barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5) | ||||
|  | ||||
| lR <- sort(log(obsData / refData), decreasing = TRUE) | ||||
| barplot(lR, | ||||
|         ylim = log(c(1/3, 3)), | ||||
|         col = AACOLS[names(lR)], | ||||
|         ylab = "log(Sequence / Average)", | ||||
|         cex.names = 0.7) | ||||
| abline(h = log(1), col="#00000055") | ||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#00000033") | ||||
|  | ||||
| yTxt <- log(0.9) | ||||
| arrows(4, yTxt, 0, yTxt, length = 0.07) | ||||
| text(5.5, yTxt, "Enriched", cex = 0.7) | ||||
| yTxt <- log(1.1) | ||||
| arrows(20, yTxt, 24, yTxt, length = 0.07) | ||||
| text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) | ||||
|  | ||||
|  | ||||
| # Task: | ||||
| #   Interpret this plot. (Can you?) Which types of amino acids are enriched? | ||||
| #   Depleted? | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										788
									
								
								BIN-Sequence.R
									
									
									
									
									
								
							
							
						
						
									
										788
									
								
								BIN-Sequence.R
									
									
									
									
									
								
							| @@ -1,394 +1,394 @@ | ||||
| # tocID <- "BIN-Sequence.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-Sequence unit. | ||||
| # | ||||
| # Version:  1.5 | ||||
| # | ||||
| # Date:     2017-09  - 2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.5    2020 Updates | ||||
| #           1.4    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| #           1.3    Update set.seed() usage | ||||
| #           1.2    Removed irrelevant task. How did that even get in there? smh | ||||
| #           1.1    Add chartr() | ||||
| #           1.0    First live version 2017. | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                Line | ||||
| #TOC> ---------------------------------------------------- | ||||
| #TOC>   1        Prepare                                63 | ||||
| #TOC>   2        Storing Sequence                       80 | ||||
| #TOC>   3        String properties                     109 | ||||
| #TOC>   4        Substrings                            116 | ||||
| #TOC>   5        Creating strings: sprintf()           137 | ||||
| #TOC>   6        Changing strings                      172 | ||||
| #TOC>   6.1.1          Changing case                   174 | ||||
| #TOC>   6.1.2          Reverse                         179 | ||||
| #TOC>   6.1.3          Change characters               183 | ||||
| #TOC>   6.1.4          Substitute characters           211 | ||||
| #TOC>   6.2        stringi and stringr                 231 | ||||
| #TOC>   6.3        dbSanitizeSequence()                241 | ||||
| #TOC>   7        Permuting and sampling                253 | ||||
| #TOC>   7.1        Permutations                        260 | ||||
| #TOC>   7.2        Sampling                            306 | ||||
| #TOC>   7.2.1          Equiprobable characters         308 | ||||
| #TOC>   7.2.2          Defined probability vector      350 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Prepare  ============================================================= | ||||
|  | ||||
| # Much basic sequence handling is supported by the Bioconductor package | ||||
| # Biostrings. | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # =    2  Storing Sequence  ==================================================== | ||||
|  | ||||
|  | ||||
| # Sequences can be represented and stored as vectors of single characters ... | ||||
| (v <- c("D", "I", "V", "M", "T", "Q")) | ||||
|  | ||||
| # ... as strings ... | ||||
| (s <- "DIVMTQ") | ||||
|  | ||||
| # ... or as more complex objects with rich metadata e.g. as a Biostrings | ||||
| # DNAString, RNAString, AAString, etc. | ||||
| (a <- Biostrings::AAString("DIVMTQ")) | ||||
|  | ||||
| # ... and all of these representations can be interconverted: | ||||
|  | ||||
| # string to vector ... | ||||
| unlist(strsplit(s, "")) | ||||
|  | ||||
| # vector to string: collapse = "" joins all elements without a separator | ||||
| paste0(v, collapse = "") | ||||
|  | ||||
| # ... and AAString to plain string. | ||||
| as.character(a) | ||||
|  | ||||
| # Since operations with character vectors trivially follow all other vector | ||||
| # conventions and syntax, and we will look at Biostrings methods in more | ||||
| # detail in a later unit, we will focus on basic strings in the following. | ||||
|  | ||||
|  | ||||
| # =    3  String properties  =================================================== | ||||
|  | ||||
|  | ||||
| length(s) # why ??? | ||||
| nchar(s)  # Aha! | ||||
|  | ||||
|  | ||||
| # =    4  Substrings  ========================================================== | ||||
|  | ||||
| # Use the substr() function | ||||
| substr(s, 2, 4) | ||||
|  | ||||
| # or the similar substring() | ||||
| substring(s, 2, 4) | ||||
|  | ||||
| # Note: both functions are vectorized (i.e. they operate on vectors | ||||
| # of arguments, you don't need to loop over input)... | ||||
| myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA") | ||||
| substr(   myBiCodes, 1, 3) | ||||
| substring(myBiCodes, 1, 3) | ||||
|  | ||||
| # ... however only substring() will also use vectors for start and stop | ||||
| s <- "gatattgtgatgacccagtaa"       # a DNA sequence | ||||
| (vI <- seq(1, nchar(s), by = 3))   # an index vector | ||||
| substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet | ||||
| substring(s, vI, vI+2)             # ... returns all triplets | ||||
|  | ||||
|  | ||||
| # =    5  Creating strings: sprintf()  ========================================= | ||||
|  | ||||
|  | ||||
| # Sprintf is a very smart, very powerful function and has cognates in all | ||||
| # other programming languages. It has a bit of a  learning curve, but this is | ||||
| # totally worth it: | ||||
| # the function takes a format string, and a list of other arguments. It returns | ||||
| # a formatted string. Here are some examples - watch carefully for sprintf() | ||||
| # calls elsewhere in the code. | ||||
|  | ||||
| sprintf("Just a string.") | ||||
| sprintf("A string and the number %d.", 5) | ||||
| sprintf("More numbers: %d ate %d.", 7, 9) # Sorry | ||||
| sprintf("Pi is ~ %1.2f ...", pi) | ||||
| sprintf("or more accurately ~ %1.11f.", pi) | ||||
| x <- "bottles of beer" | ||||
| N <- 99 | ||||
| sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.", | ||||
|         N, x, N, x, "one down, and pass it around", N - 1, x) | ||||
|  | ||||
| # Note that in the last example, the value of the string was displayed with | ||||
| # R's usual print-formatting function and therefore the line-break "\n" did | ||||
| # not actually break the line. To have line breaks, tabs etc, you need to use | ||||
| # cat() to display the string: | ||||
|  | ||||
| for (i in N:(N-4)) { | ||||
|   cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n", | ||||
|               i, x, i, x, "one down, and pass it around", i - 1, x)) | ||||
| } | ||||
|  | ||||
| # sprintf() is vectorized: if one of its parameters is a vector, it | ||||
| # will generate one output string for each of the vector's elements: | ||||
| cat(sprintf("\n%s fish", c("one", "two", "red", "blue"))) | ||||
|  | ||||
|  | ||||
| # =    6  Changing strings  ==================================================== | ||||
|  | ||||
| # ===   6.1.1  Changing case | ||||
| tolower(s) | ||||
| toupper(tolower(s)) | ||||
|  | ||||
|  | ||||
| # ===   6.1.2  Reverse | ||||
| # (This used to work in Biostrings, apparently it doesn't work anymore. Why?) | ||||
| # Biostrings::str_rev(s) | ||||
| # The following works, of course, but awkward: | ||||
| s | ||||
| paste0(rev(unlist(strsplit(s, ""))), collapse = "") | ||||
|  | ||||
| # reverse complement | ||||
| COMP <- c("t", "g", "c", "a") | ||||
| names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names | ||||
| s | ||||
| paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "") | ||||
|  | ||||
|  | ||||
| # ===   6.1.3  Change characters | ||||
| # chartr(old, new, x) maps all characters in x that appear in "old" to the | ||||
| # corresponding character in "new." Kind of like the COMP vector above ... | ||||
|  | ||||
| chartr("aeio", "uuuu", "We hold these truths to be self-evident ...") | ||||
|  | ||||
| # One could implement toupper() and tolower() with this - remember that R has | ||||
| # character vectors of uppercase and lowercase letters as language constants. | ||||
| chartr(paste0(letters, collapse = ""), | ||||
|        paste0(LETTERS, collapse = ""), | ||||
|        "Twinkle, twinkle little star, how I wonder what you are.") | ||||
|  | ||||
| # One amusing way to use the function  is for a reversible substitution | ||||
| # cypher. | ||||
| alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789" | ||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | ||||
| ( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") ) | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| # encode ... | ||||
| (x <- chartr(alBet, myCypher, "... seven for a secret, never to be told.")) | ||||
|  | ||||
| # decode ... | ||||
| chartr(myCypher, alBet, x) | ||||
| # (Nb. substitution cyphers are easy to crack!) | ||||
|  | ||||
|  | ||||
| # ===   6.1.4  Substitute characters | ||||
| # gsub can change lengths. | ||||
| #   Example: implementing the binary Fibonacci sequence: | ||||
| #   0 -> 1; 1 -> 10 , in three nested gsub() statements | ||||
| ( s <- 1 ) | ||||
| ( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) ) | ||||
|  | ||||
| # Iterate this line a few times ... | ||||
| # | ||||
| # cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html | ||||
| # for the features of the sequence. | ||||
|  | ||||
| # I use gsub() often to delete unwanted characters ... | ||||
| # ... select something, and substitute the empty string for it. | ||||
| (s <- gsub("-", "", s)) | ||||
|  | ||||
| # For example: clean up a sequence | ||||
| # copy/paste from UniProt | ||||
| (s <- "        10         20         30         40         50 | ||||
| MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ") | ||||
|  | ||||
|  | ||||
| # remove numbers | ||||
| (s <- gsub("[0-9]", "", s)) | ||||
|  | ||||
| # remove "whitespace" (spaces, tabs, line breaks)... | ||||
| (s <- gsub("\\s", "", s)) | ||||
|  | ||||
| # ==   6.2  stringi and stringr  =============================================== | ||||
|  | ||||
| # But there are also specialized functions eg. to remove leading/trailing | ||||
| # whitespace which may be important to sanitize user input etc. Have a look at | ||||
| # the function descriptions for the stringr and the stringi package. stringr is | ||||
| # part of the tidyverse, and for the most part a wrapper for stringi functions. | ||||
| # https://github.com/tidyverse/stringr | ||||
|  | ||||
|  | ||||
|  | ||||
| # ==   6.3  dbSanitizeSequence()  ============================================== | ||||
|  | ||||
| # In our learning units, we use a function dbSanitizeSequence() to clean up | ||||
| # sequences that may be copy/pasted from Web-sources | ||||
|  | ||||
| cat( s <- ">FASTA header will be removed | ||||
| 10         20         30         40         50 | ||||
| MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " ) | ||||
|  | ||||
| dbSanitizeSequence(s) | ||||
|  | ||||
|  | ||||
| # =    7  Permuting and sampling  ============================================== | ||||
|  | ||||
|  | ||||
| # An important aspect of working with strings is generating random strings | ||||
| # with given statistical properties: reference items to evaluate significance. | ||||
|  | ||||
|  | ||||
| # ==   7.1  Permutations  ====================================================== | ||||
|  | ||||
|  | ||||
| # One way to produce such reference items is to permute a string. A permuted | ||||
| # string has the same composition as the original, but all positional | ||||
| # information is lost. The sample() function can be used to permute: | ||||
|  | ||||
| # This is the sequence of the ompA secretion signal | ||||
| (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) | ||||
|  | ||||
| (x <- sample(s, length(s)))  # permuted | ||||
|  | ||||
| # Here's a small example how such permuted strings may be useful. As you look | ||||
| # at the ompA sequence, you suspect that the two lysines near the +-charged | ||||
| # N-terminus may not be accidental, but selected for a positively charged | ||||
| # N-terminus. What is the chance that such a sequence has two lysines close to | ||||
| # the N-terminus simply by chance? Or put differently: what is the average | ||||
| # distance of two lysines in such a sequence to the N-terminus. First, we | ||||
| # need an expression that measures the distance. A simple use of the which() | ||||
| # function will do just fine. | ||||
|  | ||||
| which(s == "K")        # shows they are in position 2 and 3, so ... | ||||
| mean(which(s == "K"))  # ... gives us the average, and ... | ||||
| mean(which(x == "K"))  # ... gives us the average of the permuted sequence. | ||||
|  | ||||
| # So what does the distribution look like? Let's do 10,000 trials. | ||||
|  | ||||
| (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) | ||||
| N <- 10000 | ||||
| d <- numeric(N) | ||||
|  | ||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | ||||
| for (i in 1:N) { | ||||
|   d[i] <- mean(which(sample(s, length(s)) == "K")) | ||||
| } | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| hist(d, breaks = 20) | ||||
| abline(v = 2.5, lwd = 2, col = "firebrick") | ||||
| sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the | ||||
|               # N-terminus or more. That's just below the significance | ||||
|               # threshold of 5 %. It's a trend, but to be sure we are looking | ||||
|               # at a biological effect we would need to see more | ||||
|               # sequences. | ||||
|  | ||||
|  | ||||
| # ==   7.2  Sampling  ========================================================== | ||||
|  | ||||
| # ===   7.2.1  Equiprobable characters | ||||
|  | ||||
| # Assume you need a large random-nucleotide string for some statistical model. | ||||
| # How to create such a string? sample() can easily create it: | ||||
|  | ||||
| nuc <- c("A", "C", "G", "T") | ||||
| N <- 100 | ||||
|  | ||||
| set.seed(16818)                        # set RNG seed for repeatable randomness | ||||
| v <- sample(nuc, N, replace = TRUE) | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| (mySeq <- paste(v, collapse = "")) | ||||
|  | ||||
| # What's the GC content? | ||||
| table(v) | ||||
| sum(table(v)[c("G", "C")]) # 51 is close to expected | ||||
|  | ||||
| # What's the number of CpG motifs? Easy to check with the stringi | ||||
| # stri_match_all() function | ||||
|  | ||||
| if (! requireNamespace("stringi", quietly = TRUE)) { | ||||
|   install.packages("stringi") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = stringi)       # basic information | ||||
| #  browseVignettes("stringi")    # available vignettes | ||||
| #  data(package = "stringi")     # available datasets | ||||
|  | ||||
|  | ||||
| (x <- stringi::stri_match_all(mySeq, regex = "CG")) | ||||
| # Count the matches. With no match at all, stri_match_all() still returns a | ||||
| # one-row matrix holding NA, so length(unlist(x)) would report 1 instead of 0. | ||||
| sum(! is.na(unlist(x))) | ||||
|  | ||||
| # Now you could compare that number with yeast DNA sequences, and determine | ||||
| # whether there are more or less CpG motifs than expected by chance. | ||||
| # (cf. https://en.wikipedia.org/wiki/CpG_site) | ||||
| # But hold on: is that a fair comparison? sample() gives us all four nucleotides | ||||
| # with the same probability. But the yeast genomic DNA GC content is only | ||||
| # 38%. So you would expect fewer CpG motifs based on the statistical properties | ||||
| # of the smaller number of Cs and Gs - before biology even comes into play. How | ||||
| # do we account for that? | ||||
|  | ||||
| # ===   7.2.2  Defined probability vector | ||||
|  | ||||
| # This is where we need to know how to create samples with specific probability | ||||
| # distributions. A crude hack would be to create a sampling source vector with | ||||
| # 19 C, 19 G, 31 A and 31 T | ||||
| c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31)) | ||||
| # ... but that doesn't scale if the numeric accuracy needs to be higher. | ||||
| # | ||||
| # However sample() has an argument that takes care of that: you can explicitly | ||||
| # specify the probabilities with which each element of the sampling vector | ||||
| # should be chosen: | ||||
|  | ||||
| nuc <- c("A", "C", "G", "T") | ||||
| N <- 100 | ||||
| myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities | ||||
|  | ||||
| set.seed(16818)                       # set RNG seed for repeatable randomness | ||||
| v <- sample(nuc, N, prob = myProb, replace = TRUE) | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| (mySeq <- paste(v, collapse = "")) | ||||
|  | ||||
| # What's the GC content? | ||||
| table(v) | ||||
| sum(table(v)[c("G", "C")]) # Close to expected | ||||
|  | ||||
| # What's the number of CpG motifs? | ||||
| (x <- stringi::stri_match_all(mySeq, regex = "CG")) | ||||
| # ... not a single one in this case. | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "BIN-Sequence.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the BIN-Sequence unit. | ||||
| # | ||||
| # Version:  1.5 | ||||
| # | ||||
| # Date:     2017-09  - 2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.5    2020 Updates | ||||
| #           1.4    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| #           1.3    Update set.seed() usage | ||||
| #           1.2    Removed irrelevant task. How did that even get in there? smh | ||||
| #           1.1    Add chartr() | ||||
| #           1.0    First live version 2017. | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                Line | ||||
| #TOC> ---------------------------------------------------- | ||||
| #TOC>   1        Prepare                                63 | ||||
| #TOC>   2        Storing Sequence                       80 | ||||
| #TOC>   3        String properties                     109 | ||||
| #TOC>   4        Substrings                            116 | ||||
| #TOC>   5        Creating strings: sprintf()           137 | ||||
| #TOC>   6        Changing strings                      172 | ||||
| #TOC>   6.1.1          Changing case                   174 | ||||
| #TOC>   6.1.2          Reverse                         179 | ||||
| #TOC>   6.1.3          Change characters               183 | ||||
| #TOC>   6.1.4          Substitute characters           211 | ||||
| #TOC>   6.2        stringi and stringr                 231 | ||||
| #TOC>   6.3        dbSanitizeSequence()                241 | ||||
| #TOC>   7        Permuting and sampling                253 | ||||
| #TOC>   7.1        Permutations                        260 | ||||
| #TOC>   7.2        Sampling                            306 | ||||
| #TOC>   7.2.1          Equiprobable characters         308 | ||||
| #TOC>   7.2.2          Defined probability vector      350 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Prepare  ============================================================= | ||||
|  | ||||
| # Much basic sequence handling is supported by the Bioconductor package | ||||
| # Biostrings. | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # =    2  Storing Sequence  ==================================================== | ||||
|  | ||||
|  | ||||
| # Sequences can be represented and stored as vectors of single characters ... | ||||
| (v <- c("D", "I", "V", "M", "T", "Q")) | ||||
|  | ||||
| # ... as strings ... | ||||
| (s <- "DIVMTQ") | ||||
|  | ||||
| # ... or as more complex objects with rich metadata e.g. as a Biostrings | ||||
| # DNAString, RNAString, AAString, etc. | ||||
| (a <- Biostrings::AAString("DIVMTQ")) | ||||
|  | ||||
| # ... and all of these representations can be interconverted: | ||||
|  | ||||
| # string to vector ... | ||||
| unlist(strsplit(s, "")) | ||||
|  | ||||
| # vector to string ... | ||||
| paste(v, sep = "", collapse = "") | ||||
|  | ||||
| # ... and AAString to plain string. | ||||
| as.character(a) | ||||
|  | ||||
| # Since operations with character vectors trivially follow all other vector | ||||
| # conventions and syntax, and we will look at Biostrings methods in more | ||||
| # detail in a later unit, we will focus on basic strings in the following. | ||||
|  | ||||
|  | ||||
| # =    3  String properties  =================================================== | ||||
|  | ||||
|  | ||||
| length(s) # why ??? | ||||
| nchar(s)  # Aha! | ||||
|  | ||||
|  | ||||
| # =    4  Substrings  ========================================================== | ||||
|  | ||||
| # Use the substr() function | ||||
| substr(s, 2, 4) | ||||
|  | ||||
| # or the similar substring() | ||||
| substring(s, 2, 4) | ||||
|  | ||||
| # Note: both functions are vectorized (i.e. they operate on vectors | ||||
| # of arguments, you don't need to loop over input)... | ||||
| myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA") | ||||
| substr(   myBiCodes, 1, 3) | ||||
| substring(myBiCodes, 1, 3) | ||||
|  | ||||
| # ... however only substring() will also use vectors for start and stop | ||||
| s <- "gatattgtgatgacccagtaa"       # a DNA sequence | ||||
| (vI <- seq(1, nchar(s), by = 3))   # an index vector | ||||
| substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet | ||||
| substring(s, vI, vI+2)             # ... returns all triplets | ||||
|  | ||||
|  | ||||
| # =    5  Creating strings: sprintf()  ========================================= | ||||
|  | ||||
|  | ||||
| # Sprintf is a very smart, very powerful function and has cognates in all | ||||
| # other programming languages. It has a bit of a  learning curve, but this is | ||||
| # totally worth it: | ||||
| # the function takes a format string, and a list of other arguments. It returns | ||||
| # a formatted string. Here are some examples - watch carefully for sprintf() | ||||
| # calls elsewhere in the code. | ||||
|  | ||||
| sprintf("Just a string.") | ||||
| sprintf("A string and the number %d.", 5) | ||||
| sprintf("More numbers: %d ate %d.", 7, 9) # Sorry | ||||
| sprintf("Pi is ~ %1.2f ...", pi) | ||||
| sprintf("or more accurately ~ %1.11f.", pi) | ||||
| x <- "bottles of beer" | ||||
| N <- 99 | ||||
| sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.", | ||||
|         N, x, N, x, "one down, and pass it around", N - 1, x) | ||||
|  | ||||
| # Note that in the last example, the value of the string was displayed with | ||||
| # R's usual print-formatting function and therefore the line-break "\n" did | ||||
| # not actually break the line. To have line breaks, tabs etc, you need to use | ||||
| # cat() to display the string: | ||||
|  | ||||
| for (i in N:(N-4)) { | ||||
|   cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n", | ||||
|               i, x, i, x, "one down, and pass it around", i - 1, x)) | ||||
| } | ||||
|  | ||||
| # sprintf() is vectorized: if one of its parameters is a vector, it | ||||
| # will generate one output string for each of the vector's elements: | ||||
| cat(sprintf("\n%s fish", c("one", "two", "red", "blue"))) | ||||
|  | ||||
|  | ||||
| # =    6  Changing strings  ==================================================== | ||||
|  | ||||
| # ===   6.1.1  Changing case | ||||
| tolower(s) | ||||
| toupper(tolower(s)) | ||||
|  | ||||
|  | ||||
| # ===   6.1.2  Reverse | ||||
| # (This used to work in Biostrings, apparently it doesn't work anymore. Why?) | ||||
| # Biostrings::str_rev(s) | ||||
| # The following works, of course, but awkward: | ||||
| s | ||||
| paste0(rev(unlist(strsplit(s, ""))), collapse = "") | ||||
|  | ||||
| # reverse complement | ||||
| COMP <- c("t", "g", "c", "a") | ||||
| names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names | ||||
| s | ||||
| paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "") | ||||
|  | ||||
|  | ||||
| # ===   6.1.3  Change characters | ||||
| # chartr(old, new, x) maps all characters in x that appear in "old" to the | ||||
| # corresponding character in "new." Kind of like the COMP vector above ... | ||||
|  | ||||
| chartr("aeio", "uuuu", "We hold these truths to be self-evident ...") | ||||
|  | ||||
| # One could implement toupper() and tolower() with this - remember that R has | ||||
| # character vectors of uppercase and lowercase letters as language constants. | ||||
| chartr(paste0(letters, collapse = ""), | ||||
|        paste0(LETTERS, collapse = ""), | ||||
|        "Twinkle, twinkle little star, how I wonder what you are.") | ||||
|  | ||||
| # One amusing way to use the function  is for a reversible substitution | ||||
| # cypher. | ||||
| alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789" | ||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | ||||
| ( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") ) | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| # encode ... | ||||
| (x <- chartr(alBet, myCypher, "... seven for a secret, never to be told.")) | ||||
|  | ||||
| # decode ... | ||||
| chartr(myCypher, alBet, x) | ||||
| # (Nb. substitution cyphers are easy to crack!) | ||||
|  | ||||
|  | ||||
| # ===   6.1.4  Substitute characters | ||||
| # gsub can change lengths. | ||||
| #   Example: implementing the binary Fibonacci sequence: | ||||
| #   0 -> 1; 1 -> 10 , in three nested gsub() statements | ||||
| ( s <- 1 ) | ||||
| ( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) ) | ||||
|  | ||||
| # Iterate this line a few times ... | ||||
| # | ||||
| # cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html | ||||
| # for the features of the sequence. | ||||
|  | ||||
| # I use gsub() often to delete unwanted characters ... | ||||
| # ... select something, and substitute the empty string for it. | ||||
| (s <- gsub("-", "", s)) | ||||
|  | ||||
| # For example: clean up a sequence | ||||
| # copy/paste from UniProt | ||||
| (s <- "        10         20         30         40         50 | ||||
| MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ") | ||||
|  | ||||
|  | ||||
| # remove numbers | ||||
| (s <- gsub("[0-9]", "", s)) | ||||
|  | ||||
| # remove "whitespace" (spaces, tabs, line breaks)... | ||||
| (s <- gsub("\\s", "", s)) | ||||
|  | ||||
| # ==   6.2  stringi and stringr  =============================================== | ||||
|  | ||||
| # But there are also specialized functions eg. to remove leading/trailing | ||||
| # whitespace which may be important to sanitize user input etc. Have a look at | ||||
| # the function descriptions for the stringr and the stringi package. stringr is | ||||
| # part of the tidyverse, and for the most part a wrapper for stringi functions. | ||||
| # https://github.com/tidyverse/stringr | ||||
|  | ||||
|  | ||||
|  | ||||
| # ==   6.3  dbSanitizeSequence()  ============================================== | ||||
|  | ||||
| # In our learning units, we use a function dbSanitizeSequence() to clean up | ||||
| # sequences that may be copy/pasted from Web-sources | ||||
|  | ||||
| cat( s <- ">FASTA header will be removed | ||||
| 10         20         30         40         50 | ||||
| MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " ) | ||||
|  | ||||
| dbSanitizeSequence(s) | ||||
|  | ||||
|  | ||||
| # =    7  Permuting and sampling  ============================================== | ||||
|  | ||||
|  | ||||
| # An important aspect of working with strings is generating random strings | ||||
| # with given statistical properties: reference items to evaluate significance. | ||||
|  | ||||
|  | ||||
| # ==   7.1  Permutations  ====================================================== | ||||
|  | ||||
|  | ||||
| # One way to produce such reference items is to permute a string. A permuted | ||||
| # string has the same composition as the original, but all positional | ||||
| # information is lost. The sample() function can be used to permute: | ||||
|  | ||||
| # This is the sequence of the ompA secretion signal | ||||
| (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) | ||||
|  | ||||
| (x <- sample(s, length(s)))  # permuted | ||||
|  | ||||
| # Here's a small example how such permuted strings may be useful. As you look | ||||
| # at the ompA sequence, you suspect that the two lysines near the +-charged | ||||
| # N-terminus may not be accidental, but selected for a positively charged | ||||
| # N-terminus. What is the chance that such a sequence has two lysines close to | ||||
| # the N-terminus simply by chance? Or put differently: what is the average | ||||
| # distance of two lysines in such a sequence to the N-terminus. First, we | ||||
| # need an expression that measures the distance. A simple use of the which() | ||||
| # function will do just fine. | ||||
|  | ||||
| which(s == "K")        # shows they are in position 2 and 3, so ... | ||||
| mean(which(s == "K"))  # ... gives us the average, and ... | ||||
| mean(which(x == "K"))  # ... gives us the average of the permuted sequence. | ||||
|  | ||||
| # So what does the distribution look like? Let's do 10,000 trials. | ||||
|  | ||||
| (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) | ||||
| N <- 10000 | ||||
| d <- numeric(N) | ||||
|  | ||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | ||||
| for (i in 1:N) { | ||||
|   d[i] <- mean(which(sample(s, length(s)) == "K")) | ||||
| } | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| hist(d, breaks = 20) | ||||
| abline(v = 2.5, lwd = 2, col = "firebrick") | ||||
| sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the | ||||
|               # N-terminus or more. That's just below the significance | ||||
|               # threshold of 5 %. It's a trend, but to be sure we are looking | ||||
|               # at a biological effect we would need to see more | ||||
|               # sequences. | ||||
|  | ||||
|  | ||||
| # ==   7.2  Sampling  ========================================================== | ||||
|  | ||||
| # ===   7.2.1  Equiprobable characters | ||||
|  | ||||
| # Assume you need a large random-nucleotide string for some statistical model. | ||||
| # How to create such a string? sample() can easily create it: | ||||
|  | ||||
| nuc <- c("A", "C", "G", "T") | ||||
| N <- 100 | ||||
|  | ||||
| set.seed(16818)                        # set RNG seed for repeatable randomness | ||||
| v <- sample(nuc, N, replace = TRUE) | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| (mySeq <- paste(v, collapse = "")) | ||||
|  | ||||
| # What's the GC content? | ||||
| table(v) | ||||
| sum(table(v)[c("G", "C")]) # 51 is close to expected | ||||
|  | ||||
| # What's the number of CpG motifs? Easy to check with the stringi | ||||
| # stri_match_all() function | ||||
|  | ||||
| if (! requireNamespace("stringi", quietly = TRUE)) { | ||||
|   install.packages("stringi") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = stringi)       # basic information | ||||
| #  browseVignettes("stringi")    # available vignettes | ||||
| #  data(package = "stringi")     # available datasets | ||||
|  | ||||
|  | ||||
| (x <- stringi::stri_match_all(mySeq, regex = "CG")) | ||||
| # Count the matches. With no match at all, stri_match_all() still returns a | ||||
| # one-row matrix holding NA, so length(unlist(x)) would report 1 instead of 0. | ||||
| sum(! is.na(unlist(x))) | ||||
|  | ||||
| # Now you could compare that number with yeast DNA sequences, and determine | ||||
| # whether there are more or less CpG motifs than expected by chance. | ||||
| # (cf. https://en.wikipedia.org/wiki/CpG_site) | ||||
| # But hold on: is that a fair comparison? sample() gives us all four nucleotides | ||||
| # with the same probability. But the yeast genomic DNA GC content is only | ||||
| # 38%. So you would expect fewer CpG motifs based on the statistical properties | ||||
| # of the smaller number of Cs and Gs - before biology even comes into play. How | ||||
| # do we account for that? | ||||
|  | ||||
| # ===   7.2.2  Defined probability vector | ||||
|  | ||||
| # This is where we need to know how to create samples with specific probability | ||||
| # distributions. A crude hack would be to create a sampling source vector with | ||||
| # 19 C, 19 G, 31 A and 31 T | ||||
| c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31)) | ||||
| # ... but that doesn't scale if the numeric accuracy needs to be higher. | ||||
| # | ||||
| # However sample() has an argument that takes care of that: you can explicitly | ||||
| # specify the probabilities with which each element of the sampling vector | ||||
| # should be chosen: | ||||
|  | ||||
| nuc <- c("A", "C", "G", "T") | ||||
| N <- 100 | ||||
| myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities | ||||
|  | ||||
| set.seed(16818)                       # set RNG seed for repeatable randomness | ||||
| v <- sample(nuc, N, prob = myProb, replace = TRUE) | ||||
| set.seed(NULL)                         # reset the RNG | ||||
|  | ||||
| (mySeq <- paste(v, collapse = "")) | ||||
|  | ||||
| # What's the GC content? | ||||
| table(v) | ||||
| sum(table(v)[c("G", "C")]) # Close to expected | ||||
|  | ||||
| # What's the number of CpG motifs? | ||||
| (x <- stringi::stri_match_all(mySeq, regex = "CG")) | ||||
| # ... not a single one in this case. | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										1368
									
								
								BIN-Storing_data.R
									
									
									
									
									
								
							
							
						
						
									
										1368
									
								
								BIN-Storing_data.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,349 +1,349 @@ | ||||
| # tocID <- "FND-Genetic_code.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the FND-Genetic_code unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017  10  -  2019  01 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| #           1.0.1  Comment on "incomplete final line" warning in FASTA | ||||
| #           1.0    First live version | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                            Line | ||||
| #TOC> ---------------------------------------------------------------- | ||||
| #TOC>   1        Storing the genetic code                           45 | ||||
| #TOC>   1.1        Genetic code in Biostrings                       63 | ||||
| #TOC>   2        Working with the genetic code                      94 | ||||
| #TOC>   2.1        Translate a sequence.                           129 | ||||
| #TOC>   3        An alternative representation: 3D array           212 | ||||
| #TOC>   3.1        Print a Genetic code table                      246 | ||||
| #TOC>   4        Tasks                                             272 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Storing the genetic code  ============================================ | ||||
|  | ||||
| # The genetic code maps trinucleotide codons to amino acids. To store it, we | ||||
| # need some mechanism to associate the two representations. The most | ||||
| # convenient way to do that is a "named vector" which holds the amino acid | ||||
| # code and assigns the codons as names to its elements. | ||||
|  | ||||
| x <- c("M", "H", "H", "*", "*", "*") | ||||
| names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA") | ||||
| x | ||||
|  | ||||
| # Then we can access the vector by the codon as name, and retrieve the | ||||
| # amino acid ... | ||||
|  | ||||
| x["ATG"] | ||||
| x["CAC"] | ||||
| x["TAA"] | ||||
|  | ||||
| # ... or the names of elements, to retrieve the codon(s) | ||||
| names(x)[x == "M"] | ||||
| names(x)[x == "H"] | ||||
| names(x)[x == "*"] | ||||
|  | ||||
|  | ||||
| # ==   1.1  Genetic code in Biostrings  ======================================== | ||||
|  | ||||
| # Conveniently, the standard genetic code as well as its alternatives are | ||||
| # available in the Bioconductor "Biostrings" package: | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # The standard genetic code vector | ||||
| Biostrings::GENETIC_CODE | ||||
|  | ||||
| # The table of genetic codes. This information corresponds to this page | ||||
| # at the NCBI: | ||||
| # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes | ||||
| Biostrings::GENETIC_CODE_TABLE | ||||
|  | ||||
| # Most of the alternative codes are mitochondrial codes. The id of the | ||||
| # Alternative Yeast Nuclear code is "12" | ||||
| Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear | ||||
|  | ||||
|  | ||||
| # =    2  Working with the genetic code  ======================================= | ||||
|  | ||||
| # We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it | ||||
| # to a "local" variable, rather than retrieving it from the package all the | ||||
| # time. | ||||
|  | ||||
| GC <- Biostrings::GENETIC_CODE | ||||
|  | ||||
| # This is a named vector of characters ... | ||||
|  | ||||
| str(GC) | ||||
|  | ||||
| # ... which also stores the alternative initiation codons TTG and CTG in | ||||
| # an attribute of the vector. (Alternative initiation codons sometimes are | ||||
| # used instead of ATG to initiate translation; if translation is not initiated | ||||
| # at ATG, these are still translated with fMet.) | ||||
|  | ||||
| attr(GC, "alt_init_codons") | ||||
|  | ||||
| # But the key to use this vector is in the "names" which we use for subsetting | ||||
| # the list of amino acids in whatever way we need. | ||||
| names(GC) | ||||
|  | ||||
| # The translation of "TGG" ... | ||||
| GC["TGG"] | ||||
|  | ||||
| # All stop codons | ||||
| names(GC)[GC == "*"] | ||||
|  | ||||
| # All start codons | ||||
| names(GC)[GC == "M"] # ... or | ||||
| c(names(GC)[GC == "M"], | ||||
|   attr(GC, "alt_init_codons")) | ||||
|  | ||||
|  | ||||
| # ==   2.1  Translate a sequence.  ============================================= | ||||
|  | ||||
|  | ||||
| # I have provided a gene sequence in the data directory: | ||||
| # S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence. | ||||
|  | ||||
| # read it | ||||
| mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||
|  | ||||
| # You will notice that this generates a Warning message: | ||||
| #      Warning message: | ||||
| #        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") : | ||||
| #        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa' | ||||
|  | ||||
| # The reason for this is that the last character of the file is the letter "A" | ||||
| # and not a "\n" line break. This file is exactly how it was sent from the | ||||
| # NCBI server; I think good, defensive programming practice would have been to | ||||
| # include some kind of an end-marker in the file, like a final "\n". This helps | ||||
| # us recognize an incomplete transmission. Let's parse the actual sequence from | ||||
| # the file, and then check for completeness. | ||||
|  | ||||
|  | ||||
| head(mbp1) | ||||
|  | ||||
| # drop the first line (header) | ||||
| mbp1 <- mbp1[-1] | ||||
| head(mbp1) | ||||
|  | ||||
| # concatenate it all to a single string | ||||
| mbp1 <- paste(mbp1, sep = "", collapse = "") | ||||
|  | ||||
| # how long is it? | ||||
| nchar(mbp1) | ||||
|  | ||||
| # how many codons? | ||||
| nchar(mbp1)/3 | ||||
|  | ||||
| # That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a | ||||
| # first verification that the file we read is complete, the nucleotides of a | ||||
| # complete ORF should be divisible by 3. | ||||
|  | ||||
| # Extract the codons. There are many ways to split a long string into chunks | ||||
| # of three characters. Here we use the Biostrings  codons()  function. codons() | ||||
| # requires an object of type DNAString - a special kind of string with | ||||
| # attributes that are useful for Biostrings. Thus we convert the sequence first | ||||
| # with DNAString(), then split it up, then convert it into a plain | ||||
| # character vector. | ||||
| mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1))) | ||||
|  | ||||
| head(mbp1Codons) | ||||
|  | ||||
| # now translate each codon | ||||
|  | ||||
| # Preallocate one slot per codon (rather than a hard-coded length) so the | ||||
| # result always matches the input, whatever sequence was read. | ||||
| mbp1AA <- character(length(mbp1Codons)) | ||||
| for (i in seq_along(mbp1Codons)) { | ||||
|   mbp1AA[i] <- GC[mbp1Codons[i]]   # look up the amino acid by codon name | ||||
| } | ||||
| # (The vectorized equivalent is:  mbp1AA <- unname(GC[mbp1Codons]) ) | ||||
|  | ||||
| head(mbp1Codons) | ||||
| head(mbp1AA) | ||||
|  | ||||
| tail(mbp1Codons) | ||||
| tail(mbp1AA) # Note the stop! | ||||
|  | ||||
| # The TAA "ochre" stop codon is our second verification that the nucleotide | ||||
| # sequence is complete: a stop codon can't appear internally in an ORF. | ||||
|  | ||||
| # We can work with the mbp1AA vector, for example to tabulate the | ||||
| # amino acid frequencies: | ||||
| table(mbp1AA) | ||||
| sort(table(mbp1AA), decreasing = TRUE) | ||||
|  | ||||
| # Or we can paste all elements together into a single string. But let's remove | ||||
| # the stop, it's not actually a part of the sequence. To remove the last element | ||||
| # of a vector, re-assign it with a vector minus the index of the last element: | ||||
| mbp1AA <- mbp1AA[-(length(mbp1AA))] | ||||
| tail(mbp1AA) # Note the stop is gone! | ||||
|  | ||||
| # paste it together, collapsing the elements using an empty string as the | ||||
| # separation-character (i.e.: nothing) | ||||
| (Mbp1 <- paste(mbp1AA, sep = "", collapse = "")) | ||||
|  | ||||
|  | ||||
| # =    3  An alternative representation: 3D array  ============================= | ||||
|  | ||||
|  | ||||
| # We don't use 3D arrays often - usually just 2D tables and data frames, so | ||||
| # here is a good opportunity to review the syntax of 3D arrays with a | ||||
| # genetic code cube: | ||||
|  | ||||
| # Initialize, using A G C T as the names of the elements in each dimension | ||||
| cCube <- array(data     = character(64), | ||||
|                dim      = c(4, 4, 4), | ||||
|                dimnames = list(c("A", "G", "C", "T"), | ||||
|                                c("A", "G", "C", "T"), | ||||
|                                c("A", "G", "C", "T"))) | ||||
|  | ||||
| # fill it with amino acid codes using three nested loops | ||||
| for (i in 1:4) { | ||||
|   for (j in 1:4) { | ||||
|     for (k in 1:4) { | ||||
|       myCodon <- paste(dimnames(cCube)[[1]][i], | ||||
|                        dimnames(cCube)[[2]][j], | ||||
|                        dimnames(cCube)[[3]][k], | ||||
|                        sep = "", | ||||
|                        collapse = "") | ||||
|       cCube[i, j, k] <- GC[myCodon] | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| # confirm | ||||
| cCube["A", "T", "G"] # methionine | ||||
| cCube["T", "T", "T"] # phenylalanine | ||||
| cCube["T", "A", "G"] # stop (amber) | ||||
|  | ||||
|  | ||||
|  | ||||
| # ==   3.1  Print a Genetic code table  ======================================== | ||||
|  | ||||
|  | ||||
| # The data structure of our cCube is well suited to print a table. In the | ||||
| # "standard" way to print the genetic code, we write codons with the same | ||||
| # second nucleotide in columns, and arrange rows in blocks of same | ||||
| # first nucleotide, varying the third nucleotide fastest. This maximizes the | ||||
| # similarity of adjacent amino acids in the table if we print the | ||||
| # nucleotides in the order T C A G. It's immediately obvious that the code | ||||
| # is not random: the universal genetic code is exceptionally error tolerant in | ||||
| # the sense that mutations (or single-nucleotide translation errors) are likely | ||||
| # to result in an amino acid with similar biophysical properties as the | ||||
| # original. | ||||
|  | ||||
| nuc <- c("T", "C", "A", "G") | ||||
|  | ||||
| # (calling variables f, s, t to indicate first, second, and third position ...) | ||||
| for (f in nuc) {      # first varies in blocks | ||||
|   for (t in nuc) {    # third varies in columns | ||||
|     for (s in nuc) {  # second varies in rows | ||||
|       cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t])) | ||||
|     } | ||||
|     cat("\n") | ||||
|   } | ||||
|   cat("\n") | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    4  Tasks  =============================================================== | ||||
|  | ||||
|  | ||||
| # Task: What do you need to change to print the table with U instead | ||||
| #         of T? Try it. | ||||
|  | ||||
|  | ||||
| # Task: Point mutations are more often transitions (purine -> purine; | ||||
| #         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine; | ||||
| #         pyrimidine -> purine), even though twice as many transversions | ||||
| #         are possible in the code. This is most likely due to a deamination / | ||||
| #         tautomerization process that favours C -> T changes. If the code | ||||
| #         indeed minimizes the effect of mutations, you would expect that | ||||
| #         codons that differ by a transition code for more similar amino acids | ||||
| #         than codons that differ by a transversion. Is that true? List the set | ||||
| #         of all amino acid pairs that are encoded by codons with a C -> T | ||||
| #         transition. Then list the set of amino acid pairs with a C -> A | ||||
| #         transversion. Which set of pairs is more similar? | ||||
|  | ||||
|  | ||||
| # Task: How many stop codons do the two mbp1-gene derived amino acid sequences | ||||
| #         have if you translate them in the 2. or the 3. frame? | ||||
|  | ||||
|  | ||||
| # Task: How does the amino acid composition change if you translate the mbp1 | ||||
| #         gene with the Alternative Yeast Nuclear code that is used by the | ||||
| #         "GTC clade" of fungi? | ||||
| #         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code ) | ||||
|  | ||||
| # Solution: | ||||
|  | ||||
|     # Fetch the code | ||||
|     Biostrings::GENETIC_CODE_TABLE | ||||
|     Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"] | ||||
|     altYcode <- Biostrings::getGeneticCode("12") | ||||
|  | ||||
|     # what's the difference? | ||||
|     (delta <- which(Biostrings::GENETIC_CODE != altYcode)) | ||||
|  | ||||
|     Biostrings::GENETIC_CODE[delta] | ||||
|     altYcode[delta] | ||||
|  | ||||
|     # translate | ||||
|     altYAA <- character(834) | ||||
|     for (i in seq_along(mbp1Codons)) { | ||||
|       altYAA[i] <- altYcode[mbp1Codons[i]] | ||||
|     } | ||||
|  | ||||
|     table(mbp1AA) | ||||
|     table(altYAA) | ||||
|  | ||||
| # Task: The genetic code has significant redundancy, i.e. there are up to six | ||||
| #         codons that code for the same amino acid. Write code that lists how | ||||
| #         many amino acids are present how often i.e. it should tell you that | ||||
| #         two amino acids are encoded only with a single codon, three amino | ||||
| #         acids have six codons, etc. Solution below, but don't peek. There | ||||
| #         are many possible ways to do this. | ||||
| # | ||||
| # | ||||
| # Solution: | ||||
| ( x <- table(table(Biostrings::GENETIC_CODE)) ) | ||||
|  | ||||
| # confirm | ||||
| sum(x * as.numeric(names(x))) | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "FND-Genetic_code.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the FND-Genetic_code unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017  10  -  2019  01 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| #           1.0.1  Comment on "incomplete final line" warning in FASTA | ||||
| #           1.0    First live version | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                            Line | ||||
| #TOC> ---------------------------------------------------------------- | ||||
| #TOC>   1        Storing the genetic code                           45 | ||||
| #TOC>   1.1        Genetic code in Biostrings                       63 | ||||
| #TOC>   2        Working with the genetic code                      94 | ||||
| #TOC>   2.1        Translate a sequence.                           129 | ||||
| #TOC>   3        An alternative representation: 3D array           212 | ||||
| #TOC>   3.1        Print a Genetic code table                      246 | ||||
| #TOC>   4        Tasks                                             272 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Storing the genetic code  ============================================ | ||||
|  | ||||
| # The genetic code maps trinucleotide codons to amino acids. To store it, we | ||||
| # need some mechanism to associate the two representations. The most | ||||
| # convenient way to do that is a "named vector" which holds the amino acid | ||||
| # code and assigns the codons as names to its elements. | ||||
|  | ||||
| x <- c("M", "H", "H", "*", "*", "*") | ||||
| names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA") | ||||
| x | ||||
|  | ||||
| # Then we can access the vector by the codon as name, and retrieve the | ||||
| # amino acid ... | ||||
|  | ||||
| x["ATG"] | ||||
| x["CAC"] | ||||
| x["TAA"] | ||||
|  | ||||
| # ... or the names of elements, to retrieve the codon(s) | ||||
| names(x)[x == "M"] | ||||
| names(x)[x == "H"] | ||||
| names(x)[x == "*"] | ||||
|  | ||||
|  | ||||
| # ==   1.1  Genetic code in Biostrings  ======================================== | ||||
|  | ||||
| # Conveniently, the standard genetic code as well as its alternatives are | ||||
| # available in the Bioconductor "Biostrings" package: | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # The standard genetic code vector | ||||
| Biostrings::GENETIC_CODE | ||||
|  | ||||
| # The table of genetic codes. This information corresponds to this page | ||||
| # at the NCBI: | ||||
| # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes | ||||
| Biostrings::GENETIC_CODE_TABLE | ||||
|  | ||||
| # Most of the alternative codes are mitochondrial codes. The id of the | ||||
| # Alternative Yeast Nuclear code is "12" | ||||
| Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear | ||||
|  | ||||
|  | ||||
| # =    2  Working with the genetic code  ======================================= | ||||
|  | ||||
| # We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it | ||||
| # to a "local" variable, rather than retrieving it from the package all the | ||||
| # time. | ||||
|  | ||||
| GC <- Biostrings::GENETIC_CODE | ||||
|  | ||||
| # This is a named vector of characters ... | ||||
|  | ||||
| str(GC) | ||||
|  | ||||
| # ... which also stores the alternative initiation codons TTG and CTG in | ||||
| # an attribute of the vector. (Alternative initiation codons are sometimes | ||||
| # used instead of ATG to initiate translation; even if translation is not | ||||
| # initiated at ATG, these are still translated with fMet.) | ||||
|  | ||||
| attr(GC, "alt_init_codons") | ||||
|  | ||||
| # But the key to use this vector is in the "names" which we use for subsetting | ||||
| # the list of amino acids in whatever way we need. | ||||
| names(GC) | ||||
|  | ||||
| # The translation of "TGG" ... | ||||
| GC["TGG"] | ||||
|  | ||||
| # All stop codons | ||||
| names(GC)[GC == "*"] | ||||
|  | ||||
| # All start codons | ||||
| names(GC)[GC == "M"] # ... or | ||||
| c(names(GC)[GC == "M"], | ||||
|   attr(GC, "alt_init_codons")) | ||||
|  | ||||
|  | ||||
| # ==   2.1  Translate a sequence.  ============================================= | ||||
|  | ||||
|  | ||||
| # I have provided a gene sequence in the data directory: | ||||
| # S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence. | ||||
|  | ||||
| # read it | ||||
| mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||
|  | ||||
| # You will notice that this generates a Warning message: | ||||
| #      Warning message: | ||||
| #        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") : | ||||
| #        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa' | ||||
|  | ||||
| # The reason for this is that the last character of the file is the letter "A" | ||||
| # and not a "\n" line break. This file is exactly how it was sent from the | ||||
| # NCBI server; I think good, defensive programming practice would have been to | ||||
| # include some kind of an end-marker in the file, like a final "\n". This helps | ||||
| # us recognize an incomplete transmission. Let's parse the actual sequence from | ||||
| # the file, and then check for completeness. | ||||
|  | ||||
|  | ||||
| head(mbp1) | ||||
|  | ||||
| # drop the first line (header) | ||||
| mbp1 <- mbp1[-1] | ||||
| head(mbp1) | ||||
|  | ||||
| # concatenate it all to a single string | ||||
| mbp1 <- paste(mbp1, sep = "", collapse = "") | ||||
|  | ||||
| # how long is it? | ||||
| nchar(mbp1) | ||||
|  | ||||
| # how many codons? | ||||
| nchar(mbp1)/3 | ||||
|  | ||||
| # That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a | ||||
| # first verification that the file we read is complete, the nucleotides of a | ||||
| # complete ORF should be divisible by 3. | ||||
|  | ||||
| # Extract the codons. There are many ways to split a long string into chunks | ||||
| # of three characters. Here we use the Biostrings  codons()  function. codons() | ||||
| # requires an object of type DNAString - a special kind of string with | ||||
| # attributes that are useful for Biostrings. Thus we convert the sequence first | ||||
| # with DNAString(), then split it up, then convert it into a plain | ||||
| # character vector. | ||||
| mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1))) | ||||
|  | ||||
| head(mbp1Codons) | ||||
|  | ||||
| # now translate each codon | ||||
|  | ||||
| mbp1AA <- character(834) | ||||
| for (i in seq_along(mbp1Codons)) { | ||||
|   mbp1AA[i] <- GC[mbp1Codons[i]] | ||||
| } | ||||
|  | ||||
| head(mbp1Codons) | ||||
| head(mbp1AA) | ||||
|  | ||||
| tail(mbp1Codons) | ||||
| tail(mbp1AA) # Note the stop! | ||||
|  | ||||
| # The TAA "ochre" stop codon is our second verification that the nucleotide | ||||
| # sequence is complete: a stop codon can't appear internally in an ORF. | ||||
|  | ||||
| # We can work with the mbp1AA vector, for example to tabulate the | ||||
| # amino acid frequencies: | ||||
| table(mbp1AA) | ||||
| sort(table(mbp1AA), decreasing = TRUE) | ||||
|  | ||||
| # Or we can paste all elements together into a single string. But let's remove | ||||
| # the stop, it's not actually a part of the sequence. To remove the last element | ||||
| # of a vector, re-assign it with a vector minus the index of the last element: | ||||
| mbp1AA <- mbp1AA[-(length(mbp1AA))] | ||||
| tail(mbp1AA) # Note the stop is gone! | ||||
|  | ||||
| # paste it together, collapsing the elements using an empty string as the | ||||
| # separation-character (i.e.: nothing) | ||||
| (Mbp1 <- paste(mbp1AA, sep = "", collapse = "")) | ||||
|  | ||||
|  | ||||
| # =    3  An alternative representation: 3D array  ============================= | ||||
|  | ||||
|  | ||||
| # We don't use 3D arrays often - usually just 2D tables and data frames, so | ||||
| # here is a good opportunity to review the syntax of 3D arrays with a | ||||
| # genetic code cube: | ||||
|  | ||||
| # Initialize, using A G C T as the names of the elements in each dimension | ||||
| cCube <- array(data     = character(64), | ||||
|                dim      = c(4, 4, 4), | ||||
|                dimnames = list(c("A", "G", "C", "T"), | ||||
|                                c("A", "G", "C", "T"), | ||||
|                                c("A", "G", "C", "T"))) | ||||
|  | ||||
| # fill it with amino acid codes using three nested loops | ||||
| for (i in 1:4) { | ||||
|   for (j in 1:4) { | ||||
|     for (k in 1:4) { | ||||
|       myCodon <- paste(dimnames(cCube)[[1]][i], | ||||
|                        dimnames(cCube)[[2]][j], | ||||
|                        dimnames(cCube)[[3]][k], | ||||
|                        sep = "", | ||||
|                        collapse = "") | ||||
|       cCube[i, j, k] <- GC[myCodon] | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
| # confirm | ||||
| cCube["A", "T", "G"] # methionine | ||||
| cCube["T", "T", "T"] # phenylalanine | ||||
| cCube["T", "A", "G"] # stop (amber) | ||||
|  | ||||
|  | ||||
|  | ||||
| # ==   3.1  Print a Genetic code table  ======================================== | ||||
|  | ||||
|  | ||||
| # The data structure of our cCube is well suited to print a table. In the | ||||
| # "standard" way to print the genetic code, we write codons with the same | ||||
| # second nucleotide in columns, and arrange rows in blocks of same | ||||
| # first nucleotide, varying the third nucleotide fastest. This maximizes the | ||||
| # similarity of adjacent amino acids in the table if we print the | ||||
| # nucleotides in the order T C A G. It's immediately obvious that the code | ||||
| # is not random: the universal genetic code is exceptionally error tolerant in | ||||
| # the sense that mutations (or single-nucleotide translation errors) are likely | ||||
| # to result in an amino acid with similar biophysical properties as the | ||||
| # original. | ||||
|  | ||||
| nuc <- c("T", "C", "A", "G") | ||||
|  | ||||
| # (calling variables f, s, t to indicate first, second, and third position ...) | ||||
| for (f in nuc) {      # first varies in blocks | ||||
|   for (t in nuc) {    # third varies in columns | ||||
|     for (s in nuc) {  # second varies in rows | ||||
|       cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t])) | ||||
|     } | ||||
|     cat("\n") | ||||
|   } | ||||
|   cat("\n") | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    4  Tasks  =============================================================== | ||||
|  | ||||
|  | ||||
| # Task: What do you need to change to print the table with U instead | ||||
| #         of T? Try it. | ||||
|  | ||||
|  | ||||
| # Task: Point mutations are more often transitions (purine -> purine; | ||||
| #         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine; | ||||
| #         pyrimidine -> purine), even though twice as many transversions | ||||
| #         are possible in the code. This is most likely due to a deamination / | ||||
| #         tautomerization process that favours C -> T changes. If the code | ||||
| #         indeed minimizes the effect of mutations, you would expect that | ||||
| #         codons that differ by a transition code for more similar amino acids | ||||
| #         than codons that differ by a transversion. Is that true? List the set | ||||
| #         of all amino acid pairs that are encoded by codons with a C -> T | ||||
| #         transition. Then list the set of amino acid pairs with a C -> A | ||||
| #         transversion. Which set of pairs is more similar? | ||||
|  | ||||
|  | ||||
| # Task: How many stop codons do the two mbp1-gene derived amino acid sequences | ||||
| #         have if you translate them in the 2. or the 3. frame? | ||||
|  | ||||
|  | ||||
| # Task: How does the amino acid composition change if you translate the mbp1 | ||||
| #         gene with the Alternative Yeast Nuclear code that is used by the | ||||
| #         "GTC clade" of fungi? | ||||
| #         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code ) | ||||
|  | ||||
| # Solution: | ||||
|  | ||||
|     # Fetch the code | ||||
|     Biostrings::GENETIC_CODE_TABLE | ||||
|     Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"] | ||||
|     altYcode <- Biostrings::getGeneticCode("12") | ||||
|  | ||||
|     # what's the difference? | ||||
|     (delta <- which(Biostrings::GENETIC_CODE != altYcode)) | ||||
|  | ||||
|     Biostrings::GENETIC_CODE[delta] | ||||
|     altYcode[delta] | ||||
|  | ||||
|     # translate | ||||
|     altYAA <- character(834) | ||||
|     for (i in seq_along(mbp1Codons)) { | ||||
|       altYAA[i] <- altYcode[mbp1Codons[i]] | ||||
|     } | ||||
|  | ||||
|     table(mbp1AA) | ||||
|     table(altYAA) | ||||
|  | ||||
| # Task: The genetic code has significant redundancy, i.e. there are up to six | ||||
| #         codons that code for the same amino acid. Write code that lists how | ||||
| #         many amino acids are present how often i.e. it should tell you that | ||||
| #         two amino acids are encoded only with a single codon, three amino | ||||
| #         acids have six codons, etc. Solution below, but don't peek. There | ||||
| #         are many possible ways to do this. | ||||
| # | ||||
| # | ||||
| # Solution: | ||||
| ( x <- table(table(Biostrings::GENETIC_CODE)) ) | ||||
|  | ||||
| # confirm | ||||
| sum(x * as.numeric(names(x))) | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,224 +1,224 @@ | ||||
| # tocID <- "FND-STA-Information_theory.R" | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the FND-STA-Information_theory unit. | ||||
| # | ||||
| # Version:  0.2.1 | ||||
| # | ||||
| # Date:     2017 - 2021 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           0.2.1  Maintenance | ||||
| #           0.2    Under development | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                  Line | ||||
| #TOC> -------------------------------------- | ||||
| #TOC>   1        ___Section___            39 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  ___Section___  ======================================================= | ||||
|  | ||||
| # What level of information is "significant" | ||||
|  | ||||
| # Assume the background distribution is the database frequencies of | ||||
| # amino acids: | ||||
|  | ||||
| AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to | ||||
| # sum to 1.0 | ||||
| AAref["A"] <- 0.0904 | ||||
| AAref["C"] <- 0.0123 | ||||
| AAref["D"] <- 0.0545 | ||||
| AAref["E"] <- 0.0617 | ||||
| AAref["F"] <- 0.0394 | ||||
| AAref["G"] <- 0.0724 | ||||
| AAref["H"] <- 0.0221 | ||||
| AAref["I"] <- 0.0573 | ||||
| AAref["K"] <- 0.0504 | ||||
| AAref["L"] <- 0.0986 | ||||
| AAref["M"] <- 0.0240 | ||||
| AAref["N"] <- 0.0392 | ||||
| AAref["P"] <- 0.0486 | ||||
| AAref["Q"] <- 0.0381 | ||||
| AAref["R"] <- 0.0570 | ||||
| AAref["S"] <- 0.0673 | ||||
| AAref["T"] <- 0.0558 | ||||
| AAref["V"] <- 0.0686 | ||||
| AAref["W"] <- 0.0129 | ||||
| AAref["Y"] <- 0.0294 | ||||
| sum(AAref) | ||||
|  | ||||
| # Function to calculate Shannon entropy | ||||
| H <- function(pmf) { | ||||
|   # Calculate Shannon entropy | ||||
|   # Parameters: | ||||
|   #   pmf (numeric) probability mass function: a vector of states and | ||||
|   #                 associated probabilities. Each element of | ||||
|   #                 pmf must be in (0, 1] and sum(pmf) must be 1. | ||||
|   # Value: | ||||
|   #   Shannon entropy in bits. | ||||
|   # Errors: | ||||
|   #   stop()s if pmf is not numeric, if any element lies outside (0, 1], | ||||
|   #   or if sum(pmf) is not (nearly) equal to 1. | ||||
|   # Examples: | ||||
|   #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random | ||||
|   #                                         # nucleotide sequence | ||||
|   #   H(1)     # If all elements are the same, entropy is zero | ||||
|   # | ||||
|   # Note: all.equal() returns either TRUE or a character vector that | ||||
|   # describes the difference - it never returns FALSE. The test for | ||||
|   # "not nearly equal" therefore has to be !isTRUE(all.equal(...)); | ||||
|   # isFALSE(all.equal(...)) would never trigger. | ||||
|   if (!is.numeric(pmf) || | ||||
|       any(pmf <= 0 | pmf > 1) || | ||||
|       !isTRUE(all.equal(1.0, sum(pmf)))) { | ||||
|     stop("Input is not a discrete probability distribution.") | ||||
|   } | ||||
|   # log(pmf) / log(2) converts the natural logarithm to base 2 (bits) | ||||
|   H <- -sum(pmf * (log(pmf) / log(2))) | ||||
|   return(H) | ||||
| } | ||||
|  | ||||
| # Why use all.equal()? Exact comparisons with floating point numbers are | ||||
| # brittle. Consider for example: | ||||
| 1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1 | ||||
| print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777 | ||||
| # all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8 | ||||
|  | ||||
|  | ||||
|  | ||||
| # Entropy of the database frequencies (in bits): | ||||
| (Href <- H(AAref)) | ||||
|  | ||||
| # for comparison: entropy if all amino acids are equiprobable | ||||
| H(rep(0.05, 20)) | ||||
|  | ||||
|  | ||||
| # Set up a simulation to estimate the distribution of Information values | ||||
| # from random sequences drawn from AAref. This is the distribution for the | ||||
| # statistical null hypothesis: | ||||
| nObs <- 15                      # number of observations (e.g aligned sequences) | ||||
| # nObs <- 80 | ||||
| nTrials <- 10000                # number of trials | ||||
| IObs <- numeric(nTrials)        # vector to store Information in each trial | ||||
| simCounts <- numeric(20)        # vector to tabulate our information ... | ||||
| names(simCounts) <- names(AAref)# ... with the names of AAref | ||||
|  | ||||
|  | ||||
| for (i in 1:nTrials) {  # simulate ... | ||||
|  | ||||
|   # sample AAref letters, nObs times, with the probabilities of AAref: | ||||
|   AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE) | ||||
|  | ||||
|   x <- table(AAobs)                            # table simulated observations | ||||
|   simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0 | ||||
|   simCounts[names(x)] <- x                     # overwrite with observed counts | ||||
|   simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts | ||||
|   Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H | ||||
|   IObs[i] <- Href - Hobs                       # store information | ||||
| } | ||||
|  | ||||
| # evaluate | ||||
| hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25) | ||||
| abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC") | ||||
|  | ||||
| # The purple lines are drawn at the 5% and 95% quantiles of the IObs | ||||
| # distribution - i.e. an actual observation that lies outside the purple | ||||
| # lines is deemed "significant"(1)(2). Of course, this is only true to the | ||||
| # degree that the database frequencies are a valid model for the | ||||
| # null-hypothesis on the sequence position we are considering here. | ||||
|  | ||||
| #  (1) If we use 5% quantiles, this means a value is significantly larger | ||||
| #      than expected, and we ignore cases when the value is < 0; if we | ||||
| #      consider both smaller and larger values, we need to use 2.5% quantiles, | ||||
| #      since 5% of all observations lie outside the 0.025 and 0.975 | ||||
| #      quantiles. | ||||
| # | ||||
| #  (2) For an actual observation of counts, we calculate its observed | ||||
| #      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1). | ||||
|  | ||||
|  | ||||
| # You can probably now appreciate that information is a bit of a shortcut for | ||||
| # biological sequences, and does not really take the different inherent | ||||
| # frequencies based on the character of the amino acids into account. For | ||||
| # example, L is the most frequent and C is the least frequent, but if we have an | ||||
| # alignment of 1000 sequences and we see that the frequencies for L and C are | ||||
| # swapped, that would be _very_ surprising - nevertheless, the information would | ||||
| # be 0. In order to take that into account, we should actually compute | ||||
| # Kullback-Leibler divergences. | ||||
|  | ||||
|  | ||||
| # Swap C and L frequencies | ||||
| p <- AAref | ||||
| q <- AAref | ||||
| q["L"] <- AAref["C"] | ||||
| q["C"] <- AAref["L"] | ||||
| H(p) | ||||
| H(q) | ||||
|  | ||||
| KLdiv <- function(p, q) { | ||||
|   # Calculate the Kullback-Leibler divergence of p from q. | ||||
|   # Parameters: | ||||
|   #   p, q   two pmfs of discrete probability distributions | ||||
|   #          with the same outcomes, which are nowhere 0. Elements are | ||||
|   #          paired by position; names are not checked. | ||||
|   # Value:  Kullback-Leibler divergence  sum(p * log(p / q)). log() is the | ||||
|   #         natural logarithm, so the result is in nats, not bits. | ||||
|   # Errors: stop()s if the lengths differ or if either vector contains a 0. | ||||
|  | ||||
|   if (length(p) != length(q)) { | ||||
|     stop("PANIC: input vector lengths differ!") | ||||
|   } | ||||
|   # a 0 in q would divide by zero; a 0 in p would give 0 * log(0) = NaN | ||||
|   if (any(c((p == 0), (q == 0)))) { | ||||
|     stop("PANIC: 0's found in input vectors!") | ||||
|   } | ||||
|  | ||||
|   return(sum(p * log( p / q ))) | ||||
| } | ||||
|  | ||||
| KLdiv(p, p) | ||||
| KLdiv(p, q) | ||||
|  | ||||
|  | ||||
| # Simulate the null distribution of KL divergences: draw nObs amino acids | ||||
| # from AAref, convert the counts to frequencies, and compute the divergence | ||||
| # of the simulated frequencies from AAref. | ||||
| nObs <- 15                      # number of observations (e.g. aligned sequences) | ||||
| # nObs <- 80 | ||||
| nTrials <- 10000                # number of trials | ||||
| KLdivObs <- numeric(nTrials)        # vector to store the KL divergence of each trial | ||||
| simCounts <- numeric(20)        # vector to tabulate the simulated counts ... | ||||
| names(simCounts) <- names(AAref)# ... with the names of AAref | ||||
|  | ||||
|  | ||||
| for (i in 1:nTrials) {  # simulate ... | ||||
|  | ||||
|   # sample AAref letters, nObs times, with the probabilities of AAref: | ||||
|   AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE) | ||||
|  | ||||
|   x <- table(AAobs)                            # table simulated observations | ||||
|   simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0 | ||||
|   simCounts[names(x)] <- x                     # overwrite with observed counts | ||||
|   simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts | ||||
|   simCounts <- simCounts/sum(simCounts)        # counts to frequency | ||||
|   KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv (natural log) | ||||
| } | ||||
|  | ||||
| # evaluate | ||||
| hist(KLdivObs, col = "#C9F4E3", breaks = 25) | ||||
| abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC") | ||||
| quantile(KLdivObs, 0.992) | ||||
|  | ||||
| # Running the simulation with KL does not give a fundamentally | ||||
| # different behaviour - since we are just randomly sampling. But KL would be | ||||
| # more sensitive in case there is biological selection, where the sampling is no | ||||
| # longer random. If I run the same simulation, with nObs <- 80 but calculating | ||||
| # KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L | ||||
| # frequency swap gives me a KL divergence of 0.18 - this is significant at p = | ||||
| # 0.008 - (remember, Information is 0 in this case). So that's actually quite a | ||||
| # nice addition to the toolbox. | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "FND-STA-Information_theory.R" | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the FND-STA-Information_theory unit. | ||||
| # | ||||
| # Version:  0.2.1 | ||||
| # | ||||
| # Date:     2017 - 2021 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           0.2.1  Maintenance | ||||
| #           0.2    Under development | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                  Line | ||||
| #TOC> -------------------------------------- | ||||
| #TOC>   1        ___Section___            39 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  ___Section___  ======================================================= | ||||
|  | ||||
| # What level of information is "significant" | ||||
|  | ||||
| # Assume the background distribution is the database frequencies of | ||||
| # amino acids: | ||||
|  | ||||
| AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to | ||||
| # sum to 1.0 | ||||
| AAref["A"] <- 0.0904 | ||||
| AAref["C"] <- 0.0123 | ||||
| AAref["D"] <- 0.0545 | ||||
| AAref["E"] <- 0.0617 | ||||
| AAref["F"] <- 0.0394 | ||||
| AAref["G"] <- 0.0724 | ||||
| AAref["H"] <- 0.0221 | ||||
| AAref["I"] <- 0.0573 | ||||
| AAref["K"] <- 0.0504 | ||||
| AAref["L"] <- 0.0986 | ||||
| AAref["M"] <- 0.0240 | ||||
| AAref["N"] <- 0.0392 | ||||
| AAref["P"] <- 0.0486 | ||||
| AAref["Q"] <- 0.0381 | ||||
| AAref["R"] <- 0.0570 | ||||
| AAref["S"] <- 0.0673 | ||||
| AAref["T"] <- 0.0558 | ||||
| AAref["V"] <- 0.0686 | ||||
| AAref["W"] <- 0.0129 | ||||
| AAref["Y"] <- 0.0294 | ||||
| sum(AAref) | ||||
|  | ||||
| # Function to calculate Shannon entropy | ||||
| H <- function(pmf) { | ||||
|   # Calculate Shannon entropy | ||||
|   # Parameters: | ||||
|   #   pmf (numeric) probability mass function: a vector of states and | ||||
|   #                 associated probabilities. Each element of | ||||
|   #                 pmf must be in (0, 1] and sum(pmf) must be 1. | ||||
|   # Value: | ||||
|   #   Shannon entropy in bits. | ||||
|   # Errors: | ||||
|   #   stop()s if pmf is not numeric, if any element lies outside (0, 1], | ||||
|   #   or if sum(pmf) is not (nearly) equal to 1. | ||||
|   # Examples: | ||||
|   #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random | ||||
|   #                                         # nucleotide sequence | ||||
|   #   H(1)     # If all elements are the same, entropy is zero | ||||
|   # | ||||
|   # Note: all.equal() returns either TRUE or a character vector that | ||||
|   # describes the difference - it never returns FALSE. The test for | ||||
|   # "not nearly equal" therefore has to be !isTRUE(all.equal(...)); | ||||
|   # isFALSE(all.equal(...)) would never trigger. | ||||
|   if (!is.numeric(pmf) || | ||||
|       any(pmf <= 0 | pmf > 1) || | ||||
|       !isTRUE(all.equal(1.0, sum(pmf)))) { | ||||
|     stop("Input is not a discrete probability distribution.") | ||||
|   } | ||||
|   # log(pmf) / log(2) converts the natural logarithm to base 2 (bits) | ||||
|   H <- -sum(pmf * (log(pmf) / log(2))) | ||||
|   return(H) | ||||
| } | ||||
|  | ||||
| # Why use all.equal()? Exact comparisons with floating point numbers are | ||||
| # brittle. Consider for example: | ||||
| 1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1 | ||||
| print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777 | ||||
| # all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8 | ||||
|  | ||||
|  | ||||
|  | ||||
| # Entropy of the database frequencies (in bits): | ||||
| (Href <- H(AAref)) | ||||
|  | ||||
| # for comparison: entropy if all amino acids are equiprobable | ||||
| H(rep(0.05, 20)) | ||||
|  | ||||
|  | ||||
| # Set up a simulation to estimate the distribution of Information values | ||||
| # from random sequences drawn from AAref. This is the distribution for the | ||||
| # statistical null hypothesis: | ||||
| nObs <- 15                     # number of observations (e.g. aligned sequences) | ||||
| # nObs <- 80 | ||||
| nTrials <- 10000               # number of trials | ||||
| IObs <- numeric(nTrials)       # vector to store Information in each trial | ||||
| simCounts <- numeric(20)       # vector to tabulate our observations ... | ||||
| names(simCounts) <- names(AAref) # ... with the names of AAref | ||||
|  | ||||
|  | ||||
| for (i in seq_len(nTrials)) {  # one simulated alignment column per trial | ||||
|  | ||||
|   # draw nObs letters with replacement, weighted by the AAref frequencies: | ||||
|   AAobs <- sample(names(AAref), size = nObs, replace = TRUE, prob = AAref) | ||||
|  | ||||
|   x <- table(AAobs)             # tabulate the simulated observations | ||||
|   simCounts[] <- 0              # reset every count to 0 (names are kept) | ||||
|   simCounts[names(x)] <- x      # fill in the counts that were observed | ||||
|   simCounts <- simCounts + 0.5  # add Jeffreys' pseudocounts | ||||
|   Hobs <- H(simCounts / sum(simCounts))  # counts to frequency, calc. H | ||||
|   IObs[i] <- Href - Hobs        # store information | ||||
| } | ||||
|  | ||||
| # evaluate | ||||
| hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25) | ||||
| abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC") | ||||
|  | ||||
| # The purple lines are drawn at the 5% and 95% quantiles of the IObs | ||||
| # distribution - i.e. an actual observation that lies outside the purple lines | ||||
| # is deemed "significant"(1)(2). Of course, this is only true to the degree | ||||
| # that the database frequencies are a valid model for the null-hypothesis on | ||||
| # the sequence position we are considering here. | ||||
|  | ||||
| #  (1) If we use 5% quantiles, this means a value is significantly larger | ||||
| #      than expected, and we ignore cases when the value is < 0; if we | ||||
| #      consider both smaller and larger values, we need to use 2.5% quantiles, | ||||
| #      since 5% of all observations lie outside the 0.025 and 0.975 | ||||
| #      quantiles. | ||||
| # | ||||
| #  (2) For an actual observation of counts, we calculate its observed | ||||
| #      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1). | ||||
|  | ||||
|  | ||||
| # You can probably now appreciate that information is a bit of a shortcut for | ||||
| # biological sequences, and does not really take the different inherent | ||||
| # frequencies based on the character of the amino acids into account. For | ||||
| # example, L is the most frequent and C is the least frequent, but if we have an | ||||
| # alignment of 1000 sequences and we see that the frequencies for L and C are | ||||
| # swapped, that would be _very_ surprising - nevertheless, the information would | ||||
| # be 0. In order to take that into account, we should actually compute | ||||
| # Kullback-Leibler divergences. | ||||
|  | ||||
|  | ||||
| # Swap C and L frequencies | ||||
| p <- AAref | ||||
| q <- AAref | ||||
| q["L"] <- AAref["C"] | ||||
| q["C"] <- AAref["L"] | ||||
| H(p) | ||||
| H(q) | ||||
|  | ||||
| KLdiv <- function(p, q) { | ||||
|   # Kullback-Leibler divergence of two discrete probability distributions. | ||||
|   # Parameters: | ||||
|   #   p, q (numeric) pmfs over the same outcomes, in the same order; | ||||
|   #                  neither may contain a 0. | ||||
|   # Value: | ||||
|   #   sum(p * log(p / q)), computed with the natural logarithm. | ||||
|  | ||||
|   # guard clauses: reject inputs for which the sum is not defined | ||||
|   if (length(q) != length(p)) { | ||||
|     stop("PANIC: input vector lengths differ!") | ||||
|   } | ||||
|   if (any(p == 0, q == 0)) { | ||||
|     stop("PANIC: 0's found in input vectors!") | ||||
|   } | ||||
|  | ||||
|   logRatio <- log(p / q) | ||||
|   sum(p * logRatio) | ||||
| } | ||||
|  | ||||
| KLdiv(p, p) | ||||
| KLdiv(p, q) | ||||
|  | ||||
|  | ||||
| nObs <- 15                     # number of observations (e.g. aligned sequences) | ||||
| # nObs <- 80 | ||||
| nTrials <- 10000               # number of trials | ||||
| KLdivObs <- numeric(nTrials)   # vector to store the KL divergence of each trial | ||||
| simCounts <- numeric(length(AAref))  # vector to tabulate our observations ... | ||||
| names(simCounts) <- names(AAref)     # ... with the names of AAref | ||||
|  | ||||
|  | ||||
| for (i in seq_len(nTrials)) {  # simulate ... | ||||
|  | ||||
|   # sample AAref letters, nObs times, with the probabilities of AAref: | ||||
|   AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE) | ||||
|  | ||||
|   x <- table(AAobs)                        # table simulated observations | ||||
|   simCounts[] <- 0                         # reset all counts, keep the names | ||||
|   simCounts[names(x)] <- x                 # overwrite with observed counts | ||||
|   simCounts <- simCounts + 0.5             # add Jeffreys' pseudocounts | ||||
|   simCounts <- simCounts / sum(simCounts)  # counts to frequency | ||||
|   # The pseudocounts guarantee that no frequency is 0, so the KLdiv() | ||||
|   # function defined above can be used instead of repeating its formula. | ||||
|   KLdivObs[i] <- KLdiv(simCounts, AAref)   # store KLdiv | ||||
| } | ||||
|  | ||||
| # evaluate | ||||
| hist(KLdivObs, col = "#C9F4E3", breaks = 25) | ||||
| abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC") | ||||
| quantile(KLdivObs, 0.992) | ||||
|  | ||||
| # Running the simulation with KL does not give a fundamentally | ||||
| # different behaviour - since we are just randomly sampling. But KL would be | ||||
| # more sensitive in case there is biological selection, where the sampling is no | ||||
| # longer random. If I run the same simulation, with nObs <- 80 but calculating | ||||
| # KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L | ||||
| # frequency swap gives me a KL divergence of 0.18 - this is significant at p = | ||||
| # 0.008 - (remember, Information is 0 in this case). So that's actually quite a | ||||
| # nice addition to the toolbox. | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,351 +1,351 @@ | ||||
| # tocID <- "FND-STA-Significance.R" | ||||
| # | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the FND-STA-Significance unit. | ||||
| # | ||||
| # Version:  1.3 | ||||
| # | ||||
| # Date:     2017-09  - 2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.3    2020 Maintenance. Add sample solution. | ||||
| #           1.2    Update set.seed() usage | ||||
| #           1.1    Corrected treatment of empirical p-value | ||||
| #           1.0    First contents | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                              Line | ||||
| #TOC> ------------------------------------------------------------------ | ||||
| #TOC>   1        Significance and p-value                             49 | ||||
| #TOC>   1.1        Significance levels                                60 | ||||
| #TOC>   1.2        probability and p-value                            77 | ||||
| #TOC>   1.2.1          p-value illustrated                           109 | ||||
| #TOC>   2        One- or two-sided                                   165 | ||||
| #TOC>   3        Significance by integration                         209 | ||||
| #TOC>   4        Significance by simulation or permutation           215 | ||||
| #TOC>   5        Final tasks                                         327 | ||||
| #TOC>   6        Sample solutions                                    336 | ||||
| #TOC>   6.1                                                          338 | ||||
| #TOC>   6.2                                                          342 | ||||
| #TOC>   6.3                                                          346 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Significance and p-value  ============================================ | ||||
|  | ||||
| # The idea of the probability of an event has a precise mathematical | ||||
| # interpretation, but how is it useful to know the probability? Usually we are | ||||
| # interested in whether we should accept or reject a hypothesis based on the | ||||
| # observations we have. A rational way to do this is to say: if the probability | ||||
| # of observing the data is very small under the null-hypothesis, then we will | ||||
| # assume the observation is due to something other than the null-hypothesis. But | ||||
| # what do we mean by the "probability of our observation"? And what is "very | ||||
| # small"? | ||||
|  | ||||
| # ==   1.1  Significance levels  =============================================== | ||||
|  | ||||
| # A "very small" probability is purely a matter of convention - a cultural | ||||
| # convention. In the biomedical field we usually call probabilities of less than | ||||
| # 0.05 (5%) small enough to reject the null-hypothesis. Thus we call | ||||
| # observations with a probability of less than 0.05 "significant" and if we want | ||||
| # to highlight this in text or in a graph, we often mark them with an asterisk | ||||
| # (*). Also we often call observations with a probability of less than 0.01 | ||||
| # "highly significant" and mark them with two asterisks (**). But there is no | ||||
| # special significance in these numbers, the cutoff point for significance could | ||||
| # also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the | ||||
| # British statistician Ronald Fisher happened to propose for this purpose in | ||||
| # 1925. Incidentally, Fisher later recommended to use different cutoffs for | ||||
| # different purposes (cf. | ||||
| # https://en.wikipedia.org/wiki/Statistical_significance). | ||||
|  | ||||
|  | ||||
| # ==   1.2  probability and p-value  =========================================== | ||||
|  | ||||
| # But what do we even mean by the probability of an observation? | ||||
| # Assume I am drawing samples from a normal distribution with a mean of 0 and a | ||||
| # standard deviation of 1. The sample I get is ... | ||||
|  | ||||
| set.seed(sqrt(5)) | ||||
| x <- rnorm(1) | ||||
| set.seed(NULL) | ||||
|  | ||||
| print(x, digits = 22) | ||||
| # [1] -0.8969145466249813791748 | ||||
|  | ||||
| # So what's the probability of that number? Obviously, the probability of | ||||
| # getting exactly this number is very, very, very small. But also obviously, | ||||
| # this does not mean that observing this number is in any way significant - we | ||||
| # always observe some number. That's not what we mean in this case. There are | ||||
| # several implicit assumptions when we speak of the probability of an | ||||
| # observation: | ||||
|  | ||||
| # 1: the observation can be compared to a probability distribution; | ||||
| # 2: that distribution can be integrated between any specific value | ||||
| #      and its upper and lower bounds (or +- infinity). | ||||
|  | ||||
| # Then what we really mean by the probability of an observation in the context | ||||
| # of that distribution is: the probability of observing that value, or a value | ||||
| # more extreme than the one we have. We call this the p-value. Note that we are | ||||
| # not talking about an individual number anymore, we are talking about the area | ||||
| # under the curve between our observation and the upper (or lower) bound of the | ||||
| # curve, as a fraction of the whole. | ||||
|  | ||||
|  | ||||
| # ===   1.2.1  p-value illustrated                       | ||||
|  | ||||
| # Let's illustrate. First we draw a million random values from our | ||||
| # standard, normal distribution: | ||||
|  | ||||
| N <- 1e6                             # one million | ||||
| set.seed(112358)                     # set RNG seed for repeatable randomness | ||||
| r <- rnorm(N)                        # N values from a normal distribution | ||||
| set.seed(NULL)                       # reset the RNG | ||||
|  | ||||
| # Let's see what the distribution looks like: | ||||
|  | ||||
| (h <- hist(r)) | ||||
|  | ||||
| # The histogram details are now available in the list h -  e.g. h$counts | ||||
|  | ||||
| # Where is the value we have drawn previously? | ||||
| abline(v = x, col = "#EE0000") | ||||
|  | ||||
| # How many values are smaller? | ||||
| sum(r < x) | ||||
|  | ||||
| # Let's color the bars: | ||||
| #    first, make a vector of red and green colors for the bars with breaks | ||||
| #    smaller and larger than x, white for the bar that contains x ... | ||||
| hCol <- rep("#EE000044", sum(h$breaks < x) - 1) | ||||
| hCol <- c(hCol, "#FFFFFFFF") | ||||
| hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1)) | ||||
| # ... then plot the histogram, with colored bars ... | ||||
| hist(r, col = hCol) | ||||
| # ... add two colored rectangles into the white bar ... | ||||
| idx <- sum(h$breaks < x) | ||||
| xMin <- h$breaks[idx] | ||||
| xMax <- h$breaks[idx + 1] | ||||
| y <- h$counts[idx] | ||||
| rect(xMin, 0, x, y, col = "#EE000044", border = TRUE) | ||||
| rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE) | ||||
| # ... and a red line for our observation. | ||||
| abline(v = x, col = "#EE0000", lwd = 2) | ||||
|  | ||||
| # The p-value of our observation is the red area as a fraction of the | ||||
| # whole histogram (red + green). | ||||
|  | ||||
|  | ||||
| # Task: | ||||
| #    Explain how the expression sum(r < x) works to give us a count of values | ||||
| #    with the property we are looking for. E.g., examine -4:4 < x | ||||
|  | ||||
| # Task: | ||||
| #    Write an expression to estimate the probability that a value | ||||
| #    drawn from the vector r is less-or-equal to x. The result you get | ||||
| #    will depend on the exact values that went into the vector r but it should | ||||
| #    be close to 0.185  That expression is the p-value associated with x. | ||||
| #    (Sample solution 6.1) | ||||
|  | ||||
|  | ||||
| # =    2  One- or two-sided  =================================================== | ||||
|  | ||||
| # The shape of our histogram confirms that the rnorm() function has returned | ||||
| # values that appear distributed according to a normal distribution. In a normal | ||||
| # distribution, readily available tables tell us that 5% of the values (i.e. our | ||||
| # significance level) lie 1.96 (or approximately 2) standard deviations away | ||||
| # from the mean. Is this the case here? How many values in our vector r are | ||||
| # larger than 1.96? | ||||
|  | ||||
| sum(r > 1.96) | ||||
| # [1] 24589 | ||||
|  | ||||
| # Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why? | ||||
|  | ||||
| # The answer is: we have to be careful with two-sided distributions. 2 standard | ||||
| # deviations away from the mean means either larger or smaller than 1.96 . This | ||||
| # can give rise to errors. If we are simply interested in outliers, no | ||||
| # matter larger or smaller, then the 1.96 SD cutoff for significance is correct. | ||||
| # But if we are specifically interested in, say, larger values, because a | ||||
| # smaller value is not meaningful, then the significance cutoff, expressed as | ||||
| # standard deviations, is relaxed. We can use the quantile function to see what | ||||
| # the cutoff values are: | ||||
|  | ||||
| quantile(r) | ||||
| quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries | ||||
| # close to ± 1.96, as expected | ||||
| quantile(r, probs = 0.95) # for the single 5% boundary | ||||
| # close to 1.64 . Check counts to confirm: | ||||
| sum(r > quantile(r, probs = 0.95)) | ||||
| # [1] 50000 | ||||
| # which is 5%, as expected. | ||||
|  | ||||
| # Task: | ||||
| # Use abline() to add the p = 0.05 boundary for smaller values to the histogram. | ||||
| # (Sample solution 6.2) | ||||
|  | ||||
| # To summarize: when we evaluate the significance of an event, we divide a | ||||
| # probability distribution into two parts at the point where the event was | ||||
| # observed. We then ask whether the integral over the more extreme part is less | ||||
| # or more than 5% of the whole. If it is less, we deem the event to be | ||||
| # significant. | ||||
| # | ||||
|  | ||||
|  | ||||
| # =    3  Significance by integration  ========================================= | ||||
|  | ||||
| # If the underlying probability distribution can be analytically or numerically | ||||
| # integrated, the significance of an observation can be directly computed. | ||||
|  | ||||
|  | ||||
| # =    4  Significance by simulation or permutation  =========================== | ||||
|  | ||||
| # But whether the integration is correct, or relies on assumptions that may not | ||||
| # be warranted for biological data, can be a highly technical question. | ||||
| # Fortunately, we can often simply run a simulation, a random resampling, or a | ||||
| # permutation and then count the number of outcomes, just as we did with our | ||||
| # rnorm() samples. We call this an empirical p-value. (Actually, the "empirical | ||||
| # p-value" is defined as (Nobs + 1) / (N + 1).  ) | ||||
|  | ||||
| # Here is an example. Assume you have a protein sequence and | ||||
| # you speculate that positively charged residues are close to negatively charged | ||||
| # residues to balance charge locally. A statistic that would capture this is the | ||||
| # mean minimum distance between all D,E residues and the closest R,K,H | ||||
| # residue. Let's compute this for the sequence of yeast Mbp1. | ||||
|  | ||||
| MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK", | ||||
|                "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA", | ||||
|                "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR", | ||||
|                "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ", | ||||
|                "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS", | ||||
|                "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY", | ||||
|                "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", | ||||
|                "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP", | ||||
|                "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT", | ||||
|                "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP", | ||||
|                "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK", | ||||
|                "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR", | ||||
|                "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK", | ||||
|                "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA") | ||||
|  | ||||
| # first we split this string into individual characters: | ||||
| v <- unlist(strsplit(MBP1, "")) | ||||
|  | ||||
| # and find the positions of our charged residues | ||||
|  | ||||
| ED  <- grep("[ED]", v) | ||||
| RKH <- grep("[RKH]", v) | ||||
|  | ||||
| sep <- numeric(length(ED)) # this vector will hold the distances | ||||
| for (i in seq_along(ED)) { | ||||
|   sep[i] <- min(abs(RKH - ED[i])) | ||||
| } | ||||
|  | ||||
| # Task: read and explain this bit of code | ||||
|  | ||||
| # Now that sep is computed, what does it look like? | ||||
|  | ||||
| table(sep)  # these are the minimum distances | ||||
| # 24 of D,E residues are adjacent to R,K,H; | ||||
| # the longest separation is 28 residues. | ||||
|  | ||||
| # What is the mean separation? | ||||
| mean(sep) | ||||
|  | ||||
| # The value is 4.1 . Is this significant? Honestly, I would be hard pressed | ||||
| # to solve this analytically. But by permutation it's soooo easy. | ||||
|  | ||||
| # First, we combine what we have done above into a function: | ||||
|  | ||||
| chSep <- function(v) { | ||||
|   # Mean minimum separation of oppositely charged residues. | ||||
|   # Parameter: v (char) a vector of amino acids in the one-letter code | ||||
|   #            (upper- or lower-case) | ||||
|   # Value: (numeric) for each D or E, the distance in positions to the | ||||
|   #        closest R, K, or H is taken; the mean of these is returned. | ||||
|  | ||||
|   acidic <- grep("[EDed]", v)      # indices of the D and E residues | ||||
|   basic  <- grep("[RKHrkh]", v)    # indices of the R, K and H residues | ||||
|  | ||||
|   # for every D/E position: distance to the nearest R/K/H position | ||||
|   minDist <- vapply(acidic, | ||||
|                     function(pos) { min(abs(basic - pos)) }, | ||||
|                     numeric(1)) | ||||
|   return(mean(minDist)) | ||||
| } | ||||
|  | ||||
| # Execute the function to define it. | ||||
|  | ||||
| # Confirm that the function gives the same result as the number we | ||||
| # calculated above: | ||||
| chSep(v) | ||||
|  | ||||
| # Now we can produce a random permutation of v, and recalculate | ||||
|  | ||||
| set.seed(pi)                       # set RNG seed for repeatable randomness | ||||
| w <- sample(v, length(v))          # This shuffles the vector v. Memorize this | ||||
|                                    # code paradigm. It is very useful. | ||||
| set.seed(NULL)                     # reset the RNG | ||||
|  | ||||
|  | ||||
|  | ||||
| chSep(w) | ||||
| # 3.773 ... that's actually less than what we had before. | ||||
|  | ||||
| # Let's do this 10000 times and record the results (takes a few seconds): | ||||
|  | ||||
| N <- 10000 | ||||
| chs <- numeric(N) | ||||
| for (i in 1:N) { | ||||
|   chs[i] <- chSep(sample(v, length(v))) # charge | ||||
| } | ||||
|  | ||||
| hist(chs, breaks = 50) | ||||
| abline(v = chSep(v), col = "#EE0000") | ||||
|  | ||||
| # Contrary to our expectations, the actual observed mean minimum charge | ||||
| # separation seems to be larger than what we observe in randomly permuted | ||||
| # sequences. But is this significant? Your task to find out. | ||||
|  | ||||
| # Task: | ||||
| # Calculate the empirical p-value for chSep(v) | ||||
| # (Sample solution 6.3) | ||||
|  | ||||
|  | ||||
| # =    5  Final tasks  ========================================================= | ||||
|  | ||||
| # From chs, compute the empirical p-value of a mean minimum charge separation to | ||||
| #   be larger or equal to the value observed for the yeast MBP1 sequence. Note | ||||
| #   the result in your journal. Is it significant? Also note the result of | ||||
| #   the following expression for validation: | ||||
| seal(sum(chs)) | ||||
|  | ||||
|  | ||||
| # =    6  Sample solutions  ==================================================== | ||||
|  | ||||
| # ==   6.1    ================================================================== | ||||
| # | ||||
| sum(r <= x) / length(r) | ||||
|  | ||||
| # ==   6.2    ================================================================== | ||||
| # | ||||
| abline(v = quantile(r, probs = c(0.05))) | ||||
|  | ||||
| # ==   6.3    ================================================================== | ||||
| # | ||||
| ( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) ) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "FND-STA-Significance.R" | ||||
| # | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the FND-STA-Significance unit. | ||||
| # | ||||
| # Version:  1.3 | ||||
| # | ||||
| # Date:     2017-09  - 2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.3    2020 Maintenance. Add sample solution. | ||||
| #           1.2    Update set.seed() usage | ||||
| #           1.1    Corrected treatment of empirical p-value | ||||
| #           1.0    First contents | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                              Line | ||||
| #TOC> ------------------------------------------------------------------ | ||||
| #TOC>   1        Significance and p-value                             49 | ||||
| #TOC>   1.1        Significance levels                                60 | ||||
| #TOC>   1.2        probability and p-value                            77 | ||||
| #TOC>   1.2.1          p-value illustrated                           109 | ||||
| #TOC>   2        One- or two-sided                                   165 | ||||
| #TOC>   3        Significance by integration                         209 | ||||
| #TOC>   4        Significance by simulation or permutation           215 | ||||
| #TOC>   5        Final tasks                                         327 | ||||
| #TOC>   6        Sample solutions                                    336 | ||||
| #TOC>   6.1                                                          338 | ||||
| #TOC>   6.2                                                          342 | ||||
| #TOC>   6.3                                                          346 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Significance and p-value  ============================================ | ||||
|  | ||||
| # The idea of the probability of an event has a precise mathematical | ||||
| # interpretation, but how is it useful to know the probability? Usually we are | ||||
| # interested in whether we should accept or reject a hypothesis based on the | ||||
| # observations we have. A rational way to do this is to say: if the probability | ||||
| # of observing the data is very small under the null-hypothesis, then we will | ||||
| # assume the observation is due to something other than the null-hypothesis. But | ||||
| # what do we mean by the "probability of our observation"? And what is "very | ||||
| # small"? | ||||
|  | ||||
| # ==   1.1  Significance levels  =============================================== | ||||
|  | ||||
| # A "very small" probability is purely a matter of convention - a cultural | ||||
| # convention. In the biomedical field we usually call probabilities of less than | ||||
| # 0.05 (5%) small enough to reject the null-hypothesis. Thus we call | ||||
| # observations with a probability of less than 0.05 "significant" and if we want | ||||
| # to highlight this in text or in a graph, we often mark them with an asterisk | ||||
| # (*). Also we often call observations with a probability of less than 0.01 | ||||
| # "highly significant" and mark them with two asterisks (**). But there is no | ||||
| # special significance in these numbers, the cutoff point for significance could | ||||
| # also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the | ||||
| # British statistician Ronald Fisher happened to propose for this purpose in | ||||
| # 1925. Incidentally, Fisher later recommended to use different cutoffs for | ||||
| # different purposes (cf. | ||||
| # https://en.wikipedia.org/wiki/Statistical_significance). | ||||
|  | ||||
|  | ||||
| # ==   1.2  probability and p-value  =========================================== | ||||
|  | ||||
| # But what do we even mean by the probability of an observation? | ||||
| # Assume I am drawing samples from a normal distribution with a mean of 0 and a | ||||
| # standard deviation of 1. The sample I get is ... | ||||
|  | ||||
| set.seed(sqrt(5)) | ||||
| x <- rnorm(1) | ||||
| set.seed(NULL) | ||||
|  | ||||
| print(x, digits = 22) | ||||
| # [1] -0.8969145466249813791748 | ||||
|  | ||||
| # So what's the probability of that number? Obviously, the probability of | ||||
| # getting exactly this number is very, very, very small. But also obviously, | ||||
| # this does not mean that observing this number is in any way significant - we | ||||
| # always observe some number. That's not what we mean in this case. There are | ||||
| # several implicit assumptions when we speak of the probability of an | ||||
| # observation: | ||||
|  | ||||
| # 1: the observation can be compared to a probability distribution; | ||||
| # 2: that distribution can be integrated between any specific value | ||||
| #      and its upper and lower bounds (or +- infinity). | ||||
|  | ||||
| # Then what we really mean by the probability of an observation in the context | ||||
| # of that distribution is: the probability of observing that value, or a value | ||||
| # more extreme than the one we have. We call this the p-value. Note that we are | ||||
| # not talking about an individual number anymore, we are talking about the area | ||||
| # under the curve between our observation and the upper (or lower) bound of the | ||||
| # curve, as a fraction of the whole. | ||||
|  | ||||
|  | ||||
| # ===   1.2.1  p-value illustrated                       | ||||
|  | ||||
| # Let's illustrate. First we draw a million random values from our | ||||
| # standard, normal distribution: | ||||
|  | ||||
| N <- 1e6                             # one million | ||||
| set.seed(112358)                     # set RNG seed for repeatable randomness | ||||
| r <- rnorm(N)                        # N values from a normal distribution | ||||
| set.seed(NULL)                       # reset the RNG | ||||
|  | ||||
| # Let's see what the distribution looks like: | ||||
|  | ||||
| (h <- hist(r)) | ||||
|  | ||||
| # The histogram details are now available in the list h -  e.g. h$counts | ||||
|  | ||||
| # Where is the value we have drawn previously? | ||||
| abline(v = x, col = "#EE0000") | ||||
|  | ||||
| # How many values are smaller? | ||||
| sum(r < x) | ||||
|  | ||||
| # Let's color the bars: | ||||
| #    first, make a vector of red and green colors for the bars with breaks | ||||
| #    smaller and larger than x, white for the bar that contains x ... | ||||
| hCol <- rep("#EE000044", sum(h$breaks < x) - 1) | ||||
| hCol <- c(hCol, "#FFFFFFFF") | ||||
| hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1)) | ||||
| # ... then plot the histogram, with colored bars ... | ||||
| hist(r, col = hCol) | ||||
| # ... add two colored rectangles into the white bar ... | ||||
| idx <- sum(h$breaks < x) | ||||
| xMin <- h$breaks[idx] | ||||
| xMax <- h$breaks[idx + 1] | ||||
| y <- h$counts[idx] | ||||
| rect(xMin, 0, x, y, col = "#EE000044", border = TRUE) | ||||
| rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE) | ||||
| # ... and a red line for our observation. | ||||
| abline(v = x, col = "#EE0000", lwd = 2) | ||||
|  | ||||
| # The p-value of our observation is the red area as a fraction of the | ||||
| # whole histogram (red + green). | ||||
|  | ||||
|  | ||||
| # Task: | ||||
| #    Explain how the expression sum(r < x) works to give us a count of values | ||||
| #    with the property we are looking for. E.g., examine -4:4 < x | ||||
|  | ||||
| # Task: | ||||
| #    Write an expression to estimate the probability that a value | ||||
| #    drawn from the vector r is less-or-equal to x. The result you get | ||||
| #    will depend on the exact values that went into the vector r but it should | ||||
| #    be close to 0.185 . That expression is the p-value associated with x. | ||||
| #    (Sample solution 6.1) | ||||
|  | ||||
|  | ||||
| # =    2  One- or two-sided  =================================================== | ||||
|  | ||||
| # The shape of our histogram confirms that the rnorm() function has returned | ||||
| # values that appear distributed according to a normal distribution. In a normal | ||||
| # distribution, readily available tables tell us that 5% of the values (i.e. our | ||||
| # significance level) lie 1.96 (or approximately 2) standard deviations away | ||||
| # from the mean. Is this the case here? How many values in our vector r are | ||||
| # larger than 1.96? | ||||
|  | ||||
| sum(r > 1.96) | ||||
| # [1] 24589 | ||||
|  | ||||
| # Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why? | ||||
|  | ||||
| # The answer is: we have to be careful with two-sided distributions. 2 standard | ||||
| # deviations away from the mean means either larger than 1.96 or smaller than | ||||
| # -1.96 . This can give rise to errors. If we are simply interested in outliers, | ||||
| # no matter larger or smaller, then the 1.96 SD cutoff for significance is | ||||
| # correct. | ||||
| # But if we are specifically interested in, say, larger values, because a | ||||
| # smaller value is not meaningful, then the significance cutoff, expressed as | ||||
| # standard deviations, is relaxed. We can use the quantile function to see what | ||||
| # the cutoff values are: | ||||
|  | ||||
| quantile(r) | ||||
| quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries | ||||
| # close to ± 1.96, as expected | ||||
| quantile(r, probs = 0.95) # for the single 5% boundary | ||||
| # close to 1.64 . Check counts to confirm: | ||||
| sum(r > quantile(r, probs = 0.95)) | ||||
| # [1] 50000 | ||||
| # which is 5%, as expected. | ||||
|  | ||||
| # Task: | ||||
| # Use abline() to add the p = 0.05 boundary for smaller values to the histogram. | ||||
| # (Sample solution 6.2) | ||||
|  | ||||
| # To summarize: when we evaluate the significance of an event, we divide a | ||||
| # probability distribution into two parts at the point where the event was | ||||
| # observed. We then ask whether the integral over the more extreme part is less | ||||
| # or more than 5% of the whole. If it is less, we deem the event to be | ||||
| # significant. | ||||
| # | ||||
|  | ||||
|  | ||||
| # =    3  Significance by integration  ========================================= | ||||
|  | ||||
| # If the underlying probability distribution can be analytically or numerically | ||||
| # integrated, the significance of an observation can be directly computed. | ||||
|  | ||||
|  | ||||
| # =    4  Significance by simulation or permutation  =========================== | ||||
|  | ||||
| # But whether the integration is correct, or relies on assumptions that may not | ||||
| # be warranted for biological data, can be a highly technical question. | ||||
| # Fortunately, we can often simply run a simulation, a random resampling, or a | ||||
| # permutation and then count the number of outcomes, just as we did with our | ||||
| # rnorm() samples. We call this an empirical p-value. (Actually, the "empirical | ||||
| # p-value" is defined as (Nobs + 1) / (N + 1).  ) | ||||
|  | ||||
| # Here is an example. Assume you have a protein sequence and | ||||
| # you speculate that positively charged residues are close to negatively charged | ||||
| # residues to balance charge locally. A statistic that would capture this is the | ||||
| # mean minimum distance between all D,E residues and the closest R,K,H | ||||
| # residue. Let's compute this for the sequence of yeast Mbp1. | ||||
|  | ||||
| MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK", | ||||
|                "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA", | ||||
|                "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR", | ||||
|                "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ", | ||||
|                "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS", | ||||
|                "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY", | ||||
|                "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", | ||||
|                "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP", | ||||
|                "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT", | ||||
|                "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP", | ||||
|                "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK", | ||||
|                "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR", | ||||
|                "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK", | ||||
|                "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA") | ||||
|  | ||||
| # first we split this string into individual characters: | ||||
| v <- unlist(strsplit(MBP1, "")) | ||||
|  | ||||
| # and find the positions of our charged residues | ||||
|  | ||||
| ED  <- grep("[ED]", v) | ||||
| RKH <- grep("[RKH]", v) | ||||
|  | ||||
| sep <- numeric(length(ED)) # this vector will hold the distances | ||||
| for (i in seq_along(ED)) { | ||||
|   sep[i] <- min(abs(RKH - ED[i])) | ||||
| } | ||||
|  | ||||
| # Task: read and explain this bit of code | ||||
|  | ||||
| # Now that sep is computed, what does it look like? | ||||
|  | ||||
| table(sep)  # these are the minimum distances | ||||
| # 24 of D,E residues are adjacent to R,K,H; | ||||
| # the longest separation is 28 residues. | ||||
|  | ||||
| # What is the mean separation? | ||||
| mean(sep) | ||||
|  | ||||
| # The value is 4.1 . Is this significant? Honestly, I would be hard pressed | ||||
| # to solve this analytically. But by permutation it's soooo easy. | ||||
|  | ||||
| # First, we combine what we have done above into a function: | ||||
|  | ||||
| chSep <- function(v) { | ||||
|   # Computes the mean minimum separation of oppositely charged residues. | ||||
|   # Parameter: v (char) a vector of amino acids in the one-letter code, | ||||
|   #              one residue per element (upper- or lowercase) | ||||
|   # Value: msep (numeric) mean, over all D,E residues, of the distance to | ||||
|   #          the nearest R,K,H residue; NA if v lacks either kind of residue | ||||
|  | ||||
|   ED  <- grep("[EDed]", v)     # positions of negatively charged residues | ||||
|   RKH <- grep("[RKHrkh]", v)   # positions of positively charged residues | ||||
|  | ||||
|   # Without both kinds of residue no separation is defined. Return NA rather | ||||
|   # than the NaN (no D,E) or Inf-with-warning (no R,K,H) that mean() and | ||||
|   # min() would otherwise produce on empty vectors. | ||||
|   if (length(ED) == 0 || length(RKH) == 0) { | ||||
|     return(NA_real_) | ||||
|   } | ||||
|  | ||||
|   # for each D,E position: distance to the closest R,K,H position | ||||
|   sep <- vapply(ED, | ||||
|                 function(pos) min(abs(RKH - pos)), | ||||
|                 numeric(1)) | ||||
|   return(mean(sep)) | ||||
| } | ||||
|  | ||||
| # Execute the function to define it. | ||||
|  | ||||
| # Confirm that the function gives the same result as the number we | ||||
| # calculated above: | ||||
| chSep(v) | ||||
|  | ||||
| # Now we can produce a random permutation of v, and recalculate | ||||
|  | ||||
| set.seed(pi)                       # set RNG seed for repeatable randomness | ||||
| w <- sample(v, length(v))          # This shuffles the vector v. Memorize this | ||||
|                                    # code paradigm. It is very useful. | ||||
| set.seed(NULL)                     # reset the RNG | ||||
|  | ||||
|  | ||||
|  | ||||
| chSep(w) | ||||
| # 3.773 ... that's actually less than what we had before. | ||||
|  | ||||
| # Let's do this 10000 times and record the results (takes a few seconds): | ||||
|  | ||||
| N <- 10000 | ||||
| chs <- numeric(N)            # preallocate: one result per permutation | ||||
| for (i in seq_len(N)) {      # seq_len() is safe even if N is set to 0 | ||||
|   chs[i] <- chSep(sample(v, length(v))) # chSep of one shuffled sequence | ||||
| } | ||||
|  | ||||
| hist(chs, breaks = 50) | ||||
| abline(v = chSep(v), col = "#EE0000") | ||||
|  | ||||
| # Contrary to our expectations, the actual observed mean minimum charge | ||||
| # separation seems to be larger than what we observe in randomly permuted | ||||
| # sequences. But is this significant? Your task to find out. | ||||
|  | ||||
| # Task: | ||||
| # Calculate the empirical p-value for chSep(v) | ||||
| # (Sample solution 6.3) | ||||
|  | ||||
|  | ||||
| # =    5  Final tasks  ========================================================= | ||||
|  | ||||
| # From chs, compute the empirical p-value of a mean minimum charge separation to | ||||
| #   be larger or equal to the value observed for the yeast MBP1 sequence. Note | ||||
| #   the result in your journal. Is it significant? Also note the result of | ||||
| #   the following expression for validation: | ||||
| seal(sum(chs)) | ||||
|  | ||||
|  | ||||
| # =    6  Sample solutions  ==================================================== | ||||
|  | ||||
| # ==   6.1    ================================================================== | ||||
| # | ||||
| sum(r <= x) / length(r) | ||||
|  | ||||
| # ==   6.2    ================================================================== | ||||
| # | ||||
| abline(v = quantile(r, probs = c(0.05))) | ||||
|  | ||||
| # ==   6.3    ================================================================== | ||||
| # | ||||
| ( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) ) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,3 +1,3 @@ | ||||
| # BCH441-WORK-ABC-units | ||||
|  | ||||
| # BCH441-WORK-ABC-units | ||||
|  | ||||
| This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date. | ||||
							
								
								
									
										490
									
								
								RPR-Biostrings.R
									
									
									
									
									
								
							
							
						
						
									
										490
									
								
								RPR-Biostrings.R
									
									
									
									
									
								
							| @@ -1,245 +1,245 @@ | ||||
| # tocID <- "RPR-Biostrings.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Biostrings unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Updates | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0    2017 Revisions | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                             Line | ||||
| #TOC> ----------------------------------------------------------------- | ||||
| #TOC>   1        The Biostrings:: Package                            56 | ||||
| #TOC>   2        Getting Data into Biostrings:: Objects              88 | ||||
| #TOC>   3        Working with Biostrings:: Objects                  110 | ||||
| #TOC>   3.1        Properties                                       127 | ||||
| #TOC>   3.2        Subsetting                                       168 | ||||
| #TOC>   3.3        Operators                                        180 | ||||
| #TOC>   3.4        Transformations                                  187 | ||||
| #TOC>   4        Getting Data out of Biostrings:: Objects           194 | ||||
| #TOC>   5        More                                               203 | ||||
| #TOC>   5.1        Views                                            205 | ||||
| #TOC>   5.2        Iranges                                          219 | ||||
| #TOC>   5.3        StringSets                                       225 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # This is a very brief introduction to the Biostrings:: package, other units will | ||||
| # be using more of the Biostrings:: functions. | ||||
|  | ||||
|  | ||||
| # =    1  The Biostrings:: Package  ============================================ | ||||
|  | ||||
|  | ||||
| # First, we install and load the Biostrings:: package from bioconductor (if we | ||||
| # haven't done so already). | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Examine the package information: | ||||
| library(help = Biostrings)       # basic information | ||||
| browseVignettes("Biostrings")    # available vignettes | ||||
| data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # At its core, Biostrings:: objects are "classes" of type XString (you can think | ||||
| # of a "class" in R as a special kind of list), that can take on particular | ||||
| # flavours for RNA, DNA or amino acid sequence information. | ||||
|  | ||||
| class(Biostrings::RNAString("AUG")) | ||||
| class(Biostrings::DNAString("ATG")) | ||||
| class(Biostrings::AAString("M")) | ||||
|  | ||||
| # An essential property of Biostrings:: objects is that they only allow letters | ||||
| # from the applicable IUPAC alphabet: | ||||
| Biostrings::RNAString("AUG") | ||||
| Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes | ||||
|  | ||||
|  | ||||
| # =    2  Getting Data into Biostrings:: Objects  ============================== | ||||
|  | ||||
|  | ||||
| # Example: read FASTA. Extract sequence. Convert to DNAString object. | ||||
| rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||
| rawSeq <- dbSanitizeSequence(rawSeq) | ||||
| biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence | ||||
|                                             # into an object of class DNAstring | ||||
|  | ||||
| # Multi FASTA files can be read directly as an "XStringSet" ... | ||||
| rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa" | ||||
| (biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile)) | ||||
|  | ||||
| # ... and if you subset one sequence from the set, you get an XString object | ||||
| # back again. | ||||
| (Xseq <- biosDNASet[[1]]) | ||||
|  | ||||
| biosDNAseq == Xseq           # the comparison evaluates to TRUE ... | ||||
| identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical. | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    3  Working with Biostrings:: Objects  =================================== | ||||
|  | ||||
| # Biostrings:: is a highly engineered package that is tightly integrated into | ||||
| # the Bioconductor world - unfortunately that brings with it a somewhat | ||||
| # undesirable level of computational overhead and dependencies. Using the | ||||
| # package as we normally do - i.e. calling required functions with their | ||||
| # explicit package prefix is therefore not advisable. There are generics | ||||
| # that won't be properly dispatched. If you only need a small number of | ||||
| # functions for a very specific context, you will probably get away with | ||||
| # Biostrings::<function>() - but even in the demonstration code of this script | ||||
| # not everything works out of the box. We'll therefore load the library, | ||||
| # but we'll (redundantly) use the prefix anyway so as to emphasize where | ||||
| # the functions come from. | ||||
|  | ||||
| library(Biostrings) | ||||
|  | ||||
|  | ||||
| # ==   3.1  Properties  ======================================================== | ||||
| str(rawSeq) | ||||
| str(biosDNAseq) | ||||
|  | ||||
| length(rawSeq)       # ... is 1: one string only. To get the number of | ||||
|                      # characters in a string, you need nchar(). | ||||
| length(biosDNAseq)   # but the length of a "Bstring" is the number of elements | ||||
| nchar(rawSeq) | ||||
| nchar(biosDNAseq)    # ... but nchar() works too. | ||||
|  | ||||
| (uL <- Biostrings::uniqueLetters(biosDNAseq)) | ||||
|  | ||||
| # Count frequencies - with strings, you would strsplit() into a character | ||||
| # vector and then use table(). Biostrings:: has alphabetFrequency() for this: | ||||
| Biostrings::alphabetFrequency(biosDNAseq) | ||||
|  | ||||
| # letterFrequency() works with a defined alphabet - such as what uniqueLetters() | ||||
| # returns. | ||||
| Biostrings::letterFrequency(biosDNAseq, uL) | ||||
| sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) / | ||||
|   length(biosDNAseq) # GC contents | ||||
|  | ||||
| Biostrings::dinucleotideFrequency(biosDNAseq) | ||||
| barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5) | ||||
|  | ||||
| (triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq)) | ||||
| barplot(sort(triNuc), col="#4499EE33") | ||||
| triNuc[triNuc == max(triNuc)] | ||||
| triNuc[triNuc == min(triNuc)] | ||||
| max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT | ||||
|  | ||||
| # compare to a shuffled sequence: | ||||
| (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | ||||
| barplot(sort(triNuc), col="#EEEE4433", add = TRUE) | ||||
| max(triNuc) | ||||
| # Interpret this plot. | ||||
| (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | ||||
| barplot(sort(triNuc), col="#EEEE4433") | ||||
| max(triNuc) | ||||
|  | ||||
|  | ||||
| # ==   3.2  Subsetting  ======================================================== | ||||
|  | ||||
| # Subsetting any XString object works as expected: | ||||
| biosDNAseq[4:15] | ||||
|  | ||||
| # ... well - maybe not expected, because rawSeq[4:15] would not work. | ||||
|  | ||||
| # Alternatively to the "[" operator, use the subseq() function - especially for | ||||
| # long sequences. This is far more efficient. | ||||
| Biostrings::subseq(biosDNAseq, start = 1, end = 30) | ||||
|  | ||||
|  | ||||
| # ==   3.3  Operators  ========================================================= | ||||
|  | ||||
| # RNAstring() and DNAstring() objects compare U and T as equals! | ||||
|   Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") == | ||||
|   Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT") | ||||
|  | ||||
|  | ||||
| # ==   3.4  Transformations  =================================================== | ||||
|  | ||||
| biosDNAseq[4:15] | ||||
| Biostrings::reverseComplement(biosDNAseq[4:15]) | ||||
| Biostrings::translate(biosDNAseq[4:15]) | ||||
|  | ||||
|  | ||||
| # =    4  Getting Data out of Biostrings:: Objects  ============================ | ||||
|  | ||||
| # If you need a character object, use toString(): | ||||
|  | ||||
| Biostrings::toString(biosDNAseq[4:15]) | ||||
|  | ||||
| # saveRDS() and readRDS() work like on all other R objects. | ||||
|  | ||||
|  | ||||
| # =    5  More  ================================================================ | ||||
|  | ||||
| # ==   5.1  Views  ============================================================= | ||||
|  | ||||
| # Biostring "Views" are objects that store multiple substrings of one | ||||
| # Biostring object. | ||||
|  | ||||
| (myView <- Biostrings::Views(biosDNAseq, | ||||
|                              start = c(1, 19, 37), | ||||
|                              end = c(15, 30, 45))) | ||||
|  | ||||
| # Views are convenient to store feature annotations | ||||
| names(myView) <- c("Feature-A", "Feature-B", "Feature-C") | ||||
| cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView )) | ||||
|  | ||||
|  | ||||
| # ==   5.2  Iranges  =========================================================== | ||||
|  | ||||
| # Biostrings:: Iranges are like Views with a common start point. These can be | ||||
| # useful for feature annotations. Instead of start/end you store start/width. | ||||
|  | ||||
|  | ||||
| # ==   5.3  StringSets  ======================================================== | ||||
|  | ||||
| # Biostring "StringSets" store multiple sequences. | ||||
| # | ||||
| ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA") | ||||
| sample(ompA) # sample can work directly on a Biostring object to shuffle it | ||||
|  | ||||
| x <- Biostrings::toString(ompA) | ||||
| for (i in 2:10) { | ||||
|   x[i] <- Biostrings::toString(sample(ompA)) | ||||
| } | ||||
| shuffledPeptideSet <- Biostrings::AAStringSet(x) | ||||
| names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep="")) | ||||
| shuffledPeptideSet | ||||
|  | ||||
| length(shuffledPeptideSet) | ||||
| Biostrings::width(shuffledPeptideSet) | ||||
| Biostrings::alphabetFrequency(shuffledPeptideSet) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-Biostrings.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Biostrings unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Updates | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use Biocmanager:: not biocLite() | ||||
| #           1.0    2017 Revisions | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                             Line | ||||
| #TOC> ----------------------------------------------------------------- | ||||
| #TOC>   1        The Biostrings:: Package                            56 | ||||
| #TOC>   2        Getting Data into Biostrings:: Objects              88 | ||||
| #TOC>   3        Working with Biostrings:: Objects                  110 | ||||
| #TOC>   3.1        Properties                                       127 | ||||
| #TOC>   3.2        Subsetting                                       168 | ||||
| #TOC>   3.3        Operators                                        180 | ||||
| #TOC>   3.4        Transformations                                  187 | ||||
| #TOC>   4        Getting Data out of Biostrings:: Objects           194 | ||||
| #TOC>   5        More                                               203 | ||||
| #TOC>   5.1        Views                                            205 | ||||
| #TOC>   5.2        Iranges                                          219 | ||||
| #TOC>   5.3        StringSets                                       225 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # This is a very brief introduction to the Biostrings:: package, other units will | ||||
| # be using more of the Biostrings:: functions. | ||||
|  | ||||
|  | ||||
| # =    1  The Biostrings:: Package  ============================================ | ||||
|  | ||||
|  | ||||
| # First, we install and load the Biostrings:: package from bioconductor (if we | ||||
| # haven't done so already). | ||||
|  | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Examine the package information: | ||||
| library(help = Biostrings)       # basic information | ||||
| browseVignettes("Biostrings")    # available vignettes | ||||
| data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # At its core, Biostrings:: objects are "classes" of type XString (you can think | ||||
| # of a "class" in R as a special kind of list), that can take on particular | ||||
| # flavours for RNA, DNA or amino acid sequence information. | ||||
|  | ||||
| class(Biostrings::RNAString("AUG")) | ||||
| class(Biostrings::DNAString("ATG")) | ||||
| class(Biostrings::AAString("M")) | ||||
|  | ||||
| # An essential property of Biostrings:: objects is that they only allow letters | ||||
| # from the applicable IUPAC alphabet: | ||||
| Biostrings::RNAString("AUG") | ||||
| Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes | ||||
|  | ||||
|  | ||||
| # =    2  Getting Data into Biostrings:: Objects  ============================== | ||||
|  | ||||
|  | ||||
| # Example: read FASTA. Extract sequence. Convert to DNAString object. | ||||
| rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||
| rawSeq <- dbSanitizeSequence(rawSeq) | ||||
| biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence | ||||
|                                             # into an object of class DNAstring | ||||
|  | ||||
| # Multi FASTA files can be read directly as an "XStringSet" ... | ||||
| rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa" | ||||
| (biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile)) | ||||
|  | ||||
| # ... and if you subset one sequence from the set, you get an XString object | ||||
| # back again. | ||||
| (Xseq <- biosDNASet[[1]]) | ||||
|  | ||||
| biosDNAseq == Xseq           # the comparison evaluates to TRUE ... | ||||
| identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical. | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    3  Working with Biostrings:: Objects  =================================== | ||||
|  | ||||
| # Biostrings:: is a highly engineered package that is tightly integrated into | ||||
| # the Bioconductor world - unfortunately that brings with it a somewhat | ||||
| # undesirable level of computational overhead and dependencies. Using the | ||||
| # package as we normally do - i.e. calling required functions with their | ||||
| # explicit package prefix is therefore not advisable. There are generics | ||||
| # that won't be properly dispatched. If you only need a small number of | ||||
| # functions for a very specific context, you will probably get away with | ||||
| # Biostrings::<function>() - but even in the demonstration code of this script | ||||
| # not everything works out of the box. We'll therefore load the library, | ||||
| # but we'll (redundantly) use the prefix anyway so as to emphasize where | ||||
| # the functions come from. | ||||
|  | ||||
| library(Biostrings) | ||||
|  | ||||
|  | ||||
| # ==   3.1  Properties  ======================================================== | ||||
| str(rawSeq) | ||||
| str(biosDNAseq) | ||||
|  | ||||
| length(rawSeq)       # ... is 1: one string only. To get the number of | ||||
|                      # characters in a string, you need nchar(). | ||||
| length(biosDNAseq)   # but the length of a "Bstring" is the number of elements | ||||
| nchar(rawSeq) | ||||
| nchar(biosDNAseq)    # ... but nchar() works too. | ||||
|  | ||||
| (uL <- Biostrings::uniqueLetters(biosDNAseq)) | ||||
|  | ||||
| # Count frequencies - with strings, you would strsplit() into a character | ||||
| # vector and then use table(). Biostrings:: has alphabetFrequency() for this: | ||||
| Biostrings::alphabetFrequency(biosDNAseq) | ||||
|  | ||||
| # letterFrequency() works with a defined alphabet - such as what uniqueLetters() | ||||
| # returns. | ||||
| Biostrings::letterFrequency(biosDNAseq, uL) | ||||
| sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) / | ||||
|   length(biosDNAseq) # GC contents | ||||
|  | ||||
| Biostrings::dinucleotideFrequency(biosDNAseq) | ||||
| barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5) | ||||
|  | ||||
| (triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq)) | ||||
| barplot(sort(triNuc), col="#4499EE33") | ||||
| triNuc[triNuc == max(triNuc)] | ||||
| triNuc[triNuc == min(triNuc)] | ||||
| max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT | ||||
|  | ||||
| # compare to a shuffled sequence: | ||||
| (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | ||||
| barplot(sort(triNuc), col="#EEEE4433", add = TRUE) | ||||
| max(triNuc) | ||||
| # Interpret this plot. | ||||
| (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | ||||
| barplot(sort(triNuc), col="#EEEE4433") | ||||
| max(triNuc) | ||||
|  | ||||
|  | ||||
| # ==   3.2  Subsetting  ======================================================== | ||||
|  | ||||
| # Subsetting any XString object works as expected: | ||||
| biosDNAseq[4:15] | ||||
|  | ||||
| # ... well - maybe not expected, because rawSeq[4:15] would not work. | ||||
|  | ||||
| # Alternatively to the "[" operator, use the subseq() function - especially for | ||||
| # long sequences. This is far more efficient. | ||||
| Biostrings::subseq(biosDNAseq, start = 1, end = 30) | ||||
|  | ||||
|  | ||||
| # ==   3.3  Operators  ========================================================= | ||||
|  | ||||
| # RNAstring() and DNAstring() objects compare U and T as equals! | ||||
|   Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") == | ||||
|   Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT") | ||||
|  | ||||
|  | ||||
| # ==   3.4  Transformations  =================================================== | ||||
|  | ||||
| biosDNAseq[4:15] | ||||
| Biostrings::reverseComplement(biosDNAseq[4:15]) | ||||
| Biostrings::translate(biosDNAseq[4:15]) | ||||
|  | ||||
|  | ||||
| # =    4  Getting Data out of Biostrings:: Objects  ============================ | ||||
|  | ||||
| # If you need a character object, use toString(): | ||||
|  | ||||
| Biostrings::toString(biosDNAseq[4:15]) | ||||
|  | ||||
| # saveRDS() and readRDS() work like on all other R objects. | ||||
|  | ||||
|  | ||||
| # =    5  More  ================================================================ | ||||
|  | ||||
| # ==   5.1  Views  ============================================================= | ||||
|  | ||||
| # Biostring "Views" are objects that store multiple substrings of one | ||||
| # Biostring object. | ||||
|  | ||||
| (myView <- Biostrings::Views(biosDNAseq, | ||||
|                              start = c(1, 19, 37), | ||||
|                              end = c(15, 30, 45))) | ||||
|  | ||||
| # Views are convenient to store feature annotations | ||||
| names(myView) <- c("Feature-A", "Feature-B", "Feature-C") | ||||
| cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView )) | ||||
|  | ||||
|  | ||||
| # ==   5.2  Iranges  =========================================================== | ||||
|  | ||||
| # Biostrings:: Iranges are like Views with a common start point. These can be | ||||
| # useful for feature annotations. Instead of start/end you store start/width. | ||||
|  | ||||
|  | ||||
| # ==   5.3  StringSets  ======================================================== | ||||
|  | ||||
| # Biostring "StringSets" store multiple sequences. | ||||
| # | ||||
| ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA") | ||||
| sample(ompA) # sample can work directly on a Biostring object to shuffle it | ||||
|  | ||||
| x <- Biostrings::toString(ompA) | ||||
| for (i in 2:10) { | ||||
|   x[i] <- Biostrings::toString(sample(ompA)) | ||||
| } | ||||
| shuffledPeptideSet <- Biostrings::AAStringSet(x) | ||||
| names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep="")) | ||||
| shuffledPeptideSet | ||||
|  | ||||
| length(shuffledPeptideSet) | ||||
| Biostrings::width(shuffledPeptideSet) | ||||
| Biostrings::alphabetFrequency(shuffledPeptideSet) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,165 +1,165 @@ | ||||
| # tocID <- "RPR-ChimeraX_remote.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code demonstrating remote scripting of ChimeraX. | ||||
| # | ||||
| # Version:  1.0.1 | ||||
| # | ||||
| # Date:     2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.0.1  2021 Minimal updates | ||||
| #           1.0    First ABC units version | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #    %-encode and escape quotes, or just pass-through? | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                  Line | ||||
| #TOC> ------------------------------------------------------ | ||||
| #TOC>   1        ChimeraX REMOTE SCRIPTING                41 | ||||
| #TOC>   1.1        Defining a Port                        59 | ||||
| #TOC>   1.2        Open ChimeraX                          81 | ||||
| #TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  ChimeraX REMOTE SCRIPTING  =========================================== | ||||
|  | ||||
|  | ||||
| # One of the cool features of ChimeraX is that it can be driven by Python code, | ||||
| # both within a running session and through Python scripts. What I find even | ||||
| # cooler though is that ChimeraX can be driven from any programming language via | ||||
| # its remote control function that can listen to commands sent from any other | ||||
| # application. The interface that is used here is the standard REST (method) - | ||||
| # the GET and POST verbs that ubiquitously underlie the communication of clients | ||||
| # and servers on the Web. | ||||
|  | ||||
| # In order to establish the communication between this script and ChimeraX, all | ||||
| # we need to do is: | ||||
| #  - open ChimeraX; | ||||
| #  - tell it to listen on a specific "port"; | ||||
| #  - send commands to that port via httr:: | ||||
|  | ||||
|  | ||||
| # ==   1.1  Defining a Port  =================================================== | ||||
|  | ||||
| # The httr:: package needs to be available | ||||
|  | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = httr)       # basic information | ||||
| #  browseVignettes("httr")    # available vignettes | ||||
| #  data(package = "httr")     # available datasets | ||||
|  | ||||
| # We need to think of a port. Any available port number between 49152-65535 is | ||||
| # fine. We'll choose 61803 because that's the fractional part of the golden | ||||
| # ratio. But one could choose another. | ||||
|  | ||||
| CXPORT <- 61803 | ||||
|  | ||||
| # Check that our current version of R supports sockets (default since V 3.3) | ||||
| capabilities("sockets")   # MUST be TRUE. If not, don't continue. | ||||
|  | ||||
|  | ||||
| # ==   1.2  Open ChimeraX  ===================================================== | ||||
|  | ||||
| #  - Open a fresh, new session of a recently updated version of ChimeraX | ||||
| #  - type: | ||||
| # | ||||
| #       remotecontrol rest start port 61803 | ||||
| # | ||||
| #    ... or whatever the value of CXPORT is. | ||||
|  | ||||
| # Now watch what happens in ChimeraX when you execute the following line: | ||||
| ( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") ) | ||||
|  | ||||
| # The .utilities.R script includes the function CX(), based on this principle, | ||||
| # through which you can send commands to ChimeraX | ||||
|  | ||||
| CX("camera sbs") | ||||
| CX("lighting soft") | ||||
| CX("color sequential #1 & protein target abc palette powderblue:orchid:white") | ||||
|  | ||||
| # The command echoes Chimera's response if the parameter "quietly" is | ||||
| # FALSE (default), and we can silence output with quietly = TRUE : | ||||
| CX("info models #1 attribute num_residues") | ||||
| CX("info models #1 attribute num_residues", quietly = TRUE) | ||||
|  | ||||
| # Either way, the command also returns Chimera's responses "invisibly"; | ||||
| # i.e. we can use the results by assigning the output to a variable: | ||||
| hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE) | ||||
| x <- read.table(file = textConnection(hBonds), skip = 9, | ||||
|                 blank.lines.skip = TRUE, fill = TRUE) | ||||
| hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff") | ||||
|  | ||||
|  | ||||
| # =    2  WORKED EXAMPLE: SUPERPOSITION  ======================================= | ||||
|  | ||||
| # We superimpose the 1BM8 structure with the 1DUX crystal structure to be able | ||||
| # to explore possible DNA binding regions in 1BM8 | ||||
|  | ||||
| # The model for 1BM8 is already open as model 1  (#1) | ||||
| CX("hide #1 cartoons")        # hide model 1 cartoon representation | ||||
| CX("open 1DUX")               # assume this is opened as model #2 | ||||
| CX("hide #2")                 # hide everything ... | ||||
| CX("select #2/C")             # chain c (protein) | ||||
| CX("show sel cartoons")       # ... and show cartoons of chain c (protein) | ||||
| CX("color sequential sel target c palette steelblue:darkmagenta") | ||||
| CX("view #2/C")               # re-center the display | ||||
| CX("cofr #2/C:62@CA")         # set pivot to an interface residue | ||||
| CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA | ||||
| CX("style sel stick") | ||||
| CX("show sel target ab")      # show atoms/bonds | ||||
| CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan") | ||||
| CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan") | ||||
| CX("surface sel enclose sel") # compute joint accessible surface of both chains | ||||
| CX("transparency 50") | ||||
| CX("select clear") | ||||
|  | ||||
| # Now superimpose the 1BM8 chain onto 1DUX chain C | ||||
| CX("show #1 cartoons") | ||||
| CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition | ||||
|  | ||||
| # study the general layout, and the position of the 1BM8 secondary structure | ||||
| # elements relative to 1DUX | ||||
|  | ||||
| # Let's examine side chain orientations in more detail | ||||
| CX("hide #2/C cartoons")  # hide the 1DUX protein | ||||
|  | ||||
| # select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b) | ||||
| CX("select zone #2/A,B 3.5 #1 & protein residues true") | ||||
| CX("~select sel & H")  # de-select H atoms | ||||
| CX("show sel target ab") | ||||
| CX("size stickRadius 0.4") | ||||
| CX("select clear") | ||||
|  | ||||
| # The overall architecture of the Mbp1 APSES domain is a good match for the Elk | ||||
| # transcription factor binding mode; the detailed conformations of side chains | ||||
| # would need to change only to a minor degree. There is a very significant | ||||
| # degree of structural similarity; remarkable, given that the DNA is not the | ||||
| # target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was | ||||
| # determined without a DNA ligand. | ||||
|  | ||||
| CX("remotecontrol rest stop")  # release the socket | ||||
| # Done. | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-ChimeraX_remote.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code demonstrating remote scripting of ChimeraX. | ||||
| # | ||||
| # Version:  1.0.1 | ||||
| # | ||||
| # Date:     2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.0.1  2021 Minimal updates | ||||
| #           1.0    First ABC units version | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #    %-encode and escape quotes, or just pass-through? | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                  Line | ||||
| #TOC> ------------------------------------------------------ | ||||
| #TOC>   1        ChimeraX REMOTE SCRIPTING                41 | ||||
| #TOC>   1.1        Defining a Port                        59 | ||||
| #TOC>   1.2        Open ChimeraX                          81 | ||||
| #TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  ChimeraX REMOTE SCRIPTING  =========================================== | ||||
|  | ||||
|  | ||||
| # One of the cool features of ChimeraX is that it can be driven by Python code, | ||||
| # both within a running session and through Python scripts. What I find even | ||||
| # cooler though is that ChimeraX can be driven from any programming language via | ||||
| # its remote control function that can listen to commands sent from any other | ||||
| # application. The interface that is used here is the standard REST (method) - | ||||
| # the GET and POST verbs that ubiquitously underlie the communication of clients | ||||
| # and servers on the Web. | ||||
|  | ||||
| # In order to establish the communication between this script and ChimeraX, all | ||||
| # we need to do is: | ||||
| #  - open ChimeraX; | ||||
| #  - tell it to listen on a specific "port"; | ||||
| #  - send commands to that port via httr:: | ||||
|  | ||||
|  | ||||
| # ==   1.1  Defining a Port  =================================================== | ||||
|  | ||||
| # The httr:: package needs to be available | ||||
|  | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = httr)       # basic information | ||||
| #  browseVignettes("httr")    # available vignettes | ||||
| #  data(package = "httr")     # available datasets | ||||
|  | ||||
| # We need to think of a port. Any available port number between 49152-65535 is | ||||
| # fine. We'll choose 61803 because that's the fractional part of the golden | ||||
| # ratio. But one could choose another. | ||||
|  | ||||
| CXPORT <- 61803 | ||||
|  | ||||
| # Check that our current version of R supports sockets (default since V 3.3) | ||||
| capabilities("sockets")   # MUST be TRUE. If not, don't continue. | ||||
|  | ||||
|  | ||||
| # ==   1.2  Open ChimeraX  ===================================================== | ||||
|  | ||||
| #  - Open a fresh, new session of a recently updated version of ChimeraX | ||||
| #  - type: | ||||
| # | ||||
| #       remotecontrol rest start port 61803 | ||||
| # | ||||
| #    ... or whatever the value of CXPORT is. | ||||
|  | ||||
| # Now watch what happens in ChimeraX when you execute the following line: | ||||
| ( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") ) | ||||
|  | ||||
| # The .utilities.R script includes the function CX(), based on this principle, | ||||
| # through which you can send commands to ChimeraX | ||||
|  | ||||
| CX("camera sbs") | ||||
| CX("lighting soft") | ||||
| CX("color sequential #1 & protein target abc palette powderblue:orchid:white") | ||||
|  | ||||
| # The command echoes Chimera's response if the parameter "quietly" is | ||||
| # FALSE (default), and we can silence output with quietly = TRUE : | ||||
| CX("info models #1 attribute num_residues") | ||||
| CX("info models #1 attribute num_residues", quietly = TRUE) | ||||
|  | ||||
| # Either way, the command also returns Chimera's responses "invisibly"; | ||||
| # i.e. we can use the results by assigning the output to a variable: | ||||
| hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE) | ||||
| x <- read.table(file = textConnection(hBonds), skip = 9, | ||||
|                 blank.lines.skip = TRUE, fill = TRUE) | ||||
| hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff") | ||||
|  | ||||
|  | ||||
| # =    2  WORKED EXAMPLE: SUPERPOSITION  ======================================= | ||||
|  | ||||
| # We superimpose the 1BM8 structure with the 1DUX crystal structure to be able | ||||
| # to explore possible DNA binding regions in 1BM8 | ||||
|  | ||||
| # The model for 1BM8 is already open as model 1  (#1) | ||||
| CX("hide #1 cartoons")        # hide model 1 cartoon representation | ||||
| CX("open 1DUX")               # assume this is opened as model #2 | ||||
| CX("hide #2")                 # hide everything ... | ||||
| CX("select #2/C")             # chain c (protein) | ||||
| CX("show sel cartoons")       # ... and show cartoons of chain c (protein) | ||||
| CX("color sequential sel target c palette steelblue:darkmagenta") | ||||
| CX("view #2/C")               # re-center the display | ||||
| CX("cofr #2/C:62@CA")         # set pivot to an interface residue | ||||
| CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA | ||||
| CX("style sel stick") | ||||
| CX("show sel target ab")      # show atoms/bonds | ||||
| CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan") | ||||
| CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan") | ||||
| CX("surface sel enclose sel") # compute joint accessible surface of both chains | ||||
| CX("transparency 50") | ||||
| CX("select clear") | ||||
|  | ||||
| # Now superimpose the 1BM8 chain onto 1DUX chain C | ||||
| CX("show #1 cartoons") | ||||
| CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition | ||||
|  | ||||
| # study the general layout, and the position of the 1BM8 secondary structure | ||||
| # elements relative to 1DUX | ||||
|  | ||||
| # Let's examine side chain orientations in more detail | ||||
| CX("hide #2/C cartoons")  # hide the 1DUX protein | ||||
|  | ||||
| # select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b) | ||||
| CX("select zone #2/A,B 3.5 #1 & protein residues true") | ||||
| CX("~select sel & H")  # de-select H atoms | ||||
| CX("show sel target ab") | ||||
| CX("size stickRadius 0.4") | ||||
| CX("select clear") | ||||
|  | ||||
| # The overall architecture of the Mbp1 APSES domain is a good match for the Elk | ||||
| # transcription factor binding mode; the detailed conformations of side chains | ||||
| # would need to change only to a minor degree. There is a very significant | ||||
| # degree of structural similarity; remarkable, given that the DNA is not the | ||||
| # target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was | ||||
| # determined without a DNA ligand. | ||||
|  | ||||
| CX("remotecontrol rest stop")  # release the socket | ||||
| # Done. | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										644
									
								
								RPR-FASTA.R
									
									
									
									
									
								
							
							
						
						
									
										644
									
								
								RPR-FASTA.R
									
									
									
									
									
								
							| @@ -1,322 +1,322 @@ | ||||
| # tocID <- "RPR-FASTA.R" | ||||
| # | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-FASTA unit. | ||||
| # | ||||
| # Version:  1.1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2021-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.1.2  style update | ||||
| #           1.1.1  bugfix - wrong function name | ||||
| #           1.1    2020 Maintenance. Rewrite validation logic. Add data | ||||
| #                  to utilities. Define AACOLS | ||||
| #           1.0    New unit. | ||||
| # | ||||
| # | ||||
| # TODO: Make a simple solution first, then extend it to error checking, and | ||||
| #       to handle .mfa files. | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                 Line | ||||
| #TOC> ----------------------------------------------------- | ||||
| #TOC>   1        Reading and validating FASTA            45 | ||||
| #TOC>   1.1        Validating FASTA                      81 | ||||
| #TOC>   2        Parsing FASTA                          227 | ||||
| #TOC>   3        Interpreting FASTA                     247 | ||||
| #TOC>   4        Writing FASTA                          274 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Reading and validating FASTA  ======================================== | ||||
|  | ||||
| # FASTA is a text based format, structured in lines that are separated by | ||||
| # line-feed or paragraph-break characters. Which one of these is used, depends | ||||
| # on your operating system. But R's readLines() function knows how to handle | ||||
| # these correctly, across platforms. Don't try to read such files "by hand". | ||||
| # Here is the yeast Mbp1 gene, via SGD. | ||||
|  | ||||
| file.show("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||
| faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||
|  | ||||
| # The warning is generated because the programmer at the NCBI who implemented | ||||
| # the code to write this FASTA file neglected to place a line-break character | ||||
| # after the last sequence character. While this is not technically incorrect, | ||||
| # it is poor practice: the resulting file can't be distinguished from one that | ||||
| # has been truncated in transmission. | ||||
|  | ||||
| head(faMBP1) | ||||
|  | ||||
| # Note that there are NO line-break characters ("\n") at the end of these | ||||
| # strings, even though they were present in the original file. readLines() | ||||
| # has "consumed" these characters while reading - and every single line is | ||||
| # stored as an element of its own in the resulting character vector. | ||||
|  | ||||
| tail(faMBP1) | ||||
|  | ||||
| # Also note that the last line has fewer characters - this means readLines() | ||||
| # imported the whole line, despite it not being terminated by "\n". | ||||
|  | ||||
| # It's very straightforward to work with such data, for example by collapsing | ||||
| # everything except the first line into a single string ... | ||||
|  | ||||
| f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = "")) | ||||
|  | ||||
| f[1] | ||||
| nchar(f[2]) | ||||
|  | ||||
| # ==   1.1  Validating FASTA  ================================================== | ||||
|  | ||||
| # The code above is making the assumption that everything from line 2 until | ||||
| #  the end IS sequence, the whole sequence and nothing but sequence. | ||||
| #  That assumption can break down in many ways: | ||||
| # | ||||
| #  - there could be more than one header line. The specification says otherwise, | ||||
| #       but some older files use multiple, consecutive header lines. You don't | ||||
| #       want that to end up in your sequence. | ||||
| #  - this could be not a FASTA file at all. It could be raw sequence, a | ||||
| #       different sequence file format, or a wholly different file altogether. | ||||
| #       If you look at the file, you can immediately tell, but if you are | ||||
| #       reading the file in a complex workflow, you could easily import wrong | ||||
| #       data into your analysis. | ||||
| #  - there could be more than one sequence in the file. Such Multi-FASTA files | ||||
| #       occur commonly, as downloads of ORFs from genome regions or other | ||||
| #       sets of genes or proteins, or as the input / output for multiple | ||||
| #       sequence alignment programs. | ||||
| # | ||||
| # Data "from the wild" can (and usually does) have the most unexpected | ||||
| # variations and it is really, really important to be clear about the | ||||
| # assumptions that you are making. It is possible to "fix" things, according | ||||
| # to the "Robustness Principle" : | ||||
| #      "Be conservative in what you send, | ||||
| #       be liberal in what you accept". | ||||
| #       (cf. https://en.wikipedia.org/wiki/Robustness_principle ) | ||||
| # ... but if you think about this, that's actually a really poor idea, | ||||
| # which is much more likely to dilute standards, make unwarranted | ||||
| # assumptions, and allow errors to pass silently and corrupt data. | ||||
| # | ||||
| # Let's discard this principle on the trash-heap of | ||||
| # things-that-sound-like-a-good-idea-but-aren't. What we do instead is test, | ||||
| # identify problems, and follow the principle: "crash early, crash often". Of | ||||
| # course I can write code that would reformat any possible input as a FASTA | ||||
| # file - but what good will it do me if it parses the file I receive | ||||
| # from a server into FASTA format like: | ||||
| # | ||||
| #   >404- Page Not Found</title</head> | ||||
| #   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe | ||||
| #   spellingrcntacttheadministratrsdyhtml | ||||
| # | ||||
| # Therefore, we write ourselves a FASTA checker that will enforce the following: | ||||
| #   (1) a FASTA file contains one or more sequences separated by zero or | ||||
| #       more empty lines | ||||
| #   (2) a sequence contains one header line followed by | ||||
| #       one or more sequence lines | ||||
| #   (3) a sequence line contains one or more uppercase or lowercase single | ||||
| #       letter amino acid codes, hyphens (gap character), or * (stop). | ||||
| # | ||||
| #   Anything else should generate an error. | ||||
|  | ||||
| #   (Case 1): Header(s) exist | ||||
| fX <- c("ABC", | ||||
|         "defghi", | ||||
|         "klmnpq") | ||||
| sel <- grepl("^>", fX)  # "^>" is a regular expression that | ||||
|                         # means: the exact character ">" at the | ||||
|                         # beginning ("^") of the line. | ||||
| if ( ! any(sel) ) { stop("no header lines in input.") } | ||||
|  | ||||
|  | ||||
| #   (Case 2) No adjacent header lines | ||||
| fX <- c(">ABC", | ||||
|         ">123", | ||||
|         "defghi", | ||||
|         "klmnpq") | ||||
| sel <- grepl("^>", fX) | ||||
| sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors | ||||
| if ( any(sel)) { stop("adjacent header lines in input.") } | ||||
|  | ||||
| #   (Case 3.1) all sequence lines contain only valid characters | ||||
| #              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG | ||||
| #               are defined with the .utilities.R script) | ||||
| AAVALID | ||||
| fX <- c(">ABC", | ||||
|         "def ;-) ghi", | ||||
|         "klmnpq") | ||||
| myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character | ||||
| sel <- ! grepl("^>", fX)              # NOT headers | ||||
| if (any(grepl(myRegex, fX[sel]))) { | ||||
|   stop("invalid chracter(s) outside of header lines.") | ||||
| } | ||||
|  | ||||
| #   (Case 3.2) all headers are followed directly by | ||||
| #              at least one letter of sequence | ||||
| fX <- c(">ABC", | ||||
|         "", | ||||
|         ">123", | ||||
|         "defghi", | ||||
|         "klmnpq") | ||||
| sel <- grep("^>", fX) + 1             # indexes of headers + 1 | ||||
| myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character | ||||
| if (! all(grepl(myRegex, fX[sel]))) { | ||||
|   stop("a header has no adjacent sequence.") | ||||
| } | ||||
| # Ah, you might ask - couldn't we just have dropped all empty lines, and | ||||
| # then caught this in Case 2? No - for two reasons: we would still miss headers | ||||
| # at the end of file, and, we would have changed the line numbering - and | ||||
| # ideally our "production" function will create information about where the | ||||
| # error is to be found. | ||||
|  | ||||
|  | ||||
| # Now combine this into a function ... | ||||
|  | ||||
| val <- function(fa) { | ||||
|  | ||||
|   if ( ! any(grepl("^>", fa)) ) { | ||||
|     stop("no header lines in input.") | ||||
|   } | ||||
|  | ||||
|   sel <- grepl("^>", fa) | ||||
|   if ( any(sel[- length(sel)] & sel[-1])) { | ||||
|     stop("adjacent header lines in input.") | ||||
|   } | ||||
|  | ||||
|   sel <- ! grepl("^>", fa) | ||||
|   if ( any(grepl(sprintf("[^%s]", AAVALID), fa[sel]))) { | ||||
|     stop("invalid chracter(s) outside of header lines.") | ||||
|   } | ||||
|  | ||||
|   sel <- grep("^>", fa) + 1 | ||||
|   if (! all(grepl(sprintf("[%s]+", AAVALID), fa[sel]))) { | ||||
|     stop("a header has no adjacent sequence.") | ||||
|   } | ||||
|  | ||||
|   return(invisible(NULL)) | ||||
| } | ||||
|  | ||||
| # Here is an example | ||||
| FA <- c(">head1", | ||||
|         "acdef", | ||||
|         "ghi", | ||||
|         "", | ||||
|         ">head2", | ||||
|         "kl", | ||||
|         ">head3", | ||||
|         "mn", | ||||
|         "pqrs") | ||||
| val(FA)     # ... should not create an error | ||||
|  | ||||
|  | ||||
| # A somewhat more elaborate validateFA() function was loaded with the | ||||
| # ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi- | ||||
| # fasta files have space-characters in their spacer lines. Try it ... | ||||
| validateFA(FA) | ||||
|  | ||||
| # =    2  Parsing FASTA  ======================================================= | ||||
|  | ||||
| # Once we have validated our assumptions about our input, it's quite | ||||
| # painless to parse it. I have put this together as a function and the function | ||||
| # gets loaded from ./.utilities.R | ||||
| # | ||||
|  | ||||
| # Let's try this: | ||||
| #   - the first 3 elements of faMBP1: | ||||
| readFASTA(faMBP1[1:3]) | ||||
|  | ||||
| #   - a multi FASTA file of aligned APSES domain sequences: | ||||
|  | ||||
| refAPSES <- readFASTA("./data/refAPSES.mfa") | ||||
|  | ||||
| # Subset the sequence with "P39678" in the header | ||||
| refAPSES[grep("P39678", refAPSES$head) ,] | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    3  Interpreting FASTA  ================================================== | ||||
|  | ||||
|  | ||||
| # FASTA files are straightforward to interpret - just one thing may be of note: | ||||
| # when working with strings, we can use substr(<string>, <start>, <stop>) to | ||||
| # extract substrings, but more often we expand the string into a vector of | ||||
| # single characters with strsplit(<string>, ""). strsplit() returns a list, | ||||
| # to accommodate that <string> could be a vector of many elements, therefore | ||||
| # we usually unlist() the result if we use it only on a single string. | ||||
|  | ||||
| # Example: How many positively charged residues in "MBP1_SACCE"? | ||||
|  | ||||
| s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], "")) | ||||
| s | ||||
|  | ||||
| sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE | ||||
|                        # for the characters, sum() coerces to 1 and 0 | ||||
|                        # respectively, and that gives us the result. | ||||
|  | ||||
| 100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 % | ||||
|  | ||||
| # residue distribution | ||||
| x <- factor(s, levels = names(AACOLS)) | ||||
| pie(table(x)[names(AACOLS)], col = AACOLS) | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    4  Writing FASTA  ======================================================= | ||||
|  | ||||
|  | ||||
| # Writing FASTA files is mostly just the reverse of reading, with one | ||||
| # twist: we need to break the long sequence string into chunks of the desired | ||||
| # width. The FASTA specification calls for a maximum of 120 characters per line, | ||||
| # but writing out much less than that is common, since it lets us comfortably | ||||
| # view lines on the console, or print them on a sheet of paper (do we still | ||||
| # do that actually?). How do we break a string into chunks? A combination of | ||||
| # seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work | ||||
| # nicely. (Note that substring() is vectorized, whereas substr() is not!) As we | ||||
| # loop through our FASTA object in memory, we can build the output by c()'ing | ||||
| # blocks of header + sequence to each other. For VERY large objects this might | ||||
| # be slow - in that case, we might want to precalculate the size of the output | ||||
| # object. But that's more of a hypothetical consideration. | ||||
|  | ||||
| ( s <- refAPSES$seq[2] ) | ||||
| nchar(s) | ||||
| w <- 30     # width of chunk | ||||
| (starts <- seq(1, nchar(s), by = w))      # starting index of chunk | ||||
| (ends <- c((starts - 1)[-1], nchar(s)))   # ending index of chunk | ||||
|  | ||||
| # Task: Is this safe? What happens if nchar(s) is shorter than w? | ||||
| #       What happens if nchar(s) is an exact multiple of w? | ||||
|  | ||||
| substring(s, starts, ends) | ||||
| # confirm that the output contains the first and last residue, and both | ||||
| # residues adjacent to the breaks | ||||
|  | ||||
| # As always, the function has been defined in ".utilities.R" for you to use | ||||
| # any time...  type   writeFASTA  to examine it. | ||||
|  | ||||
| # Let's try this... | ||||
|  | ||||
| writeFASTA(refAPSES, width = 40) | ||||
|  | ||||
| # roundtrip for validation: write refAPSES with a different format, | ||||
| # read it back in - the new dataframe must be identical | ||||
| # to the original dataframe. | ||||
| fname <- tempfile() | ||||
| writeFASTA(refAPSES, fn = fname, width = 30) | ||||
| identical(refAPSES, readFASTA(fname)) | ||||
|  | ||||
| # ...works for me  :-) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-FASTA.R" | ||||
| # | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-FASTA unit. | ||||
| # | ||||
| # Version:  1.1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2021-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.1.2  style update | ||||
| #           1.1.1  bugfix - wrong function name | ||||
| #           1.1    2020 Maintenance. Rewrite validation logic. Add data | ||||
| #                  to utilities. Define AACOLS | ||||
| #           1.0    New unit. | ||||
| # | ||||
| # | ||||
| # TODO: Make a simple solution first, then extend it to error checking, and | ||||
| #       to handle .mfa files. | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                 Line | ||||
| #TOC> ----------------------------------------------------- | ||||
| #TOC>   1        Reading and validating FASTA            45 | ||||
| #TOC>   1.1        Validating FASTA                      81 | ||||
| #TOC>   2        Parsing FASTA                          227 | ||||
| #TOC>   3        Interpreting FASTA                     247 | ||||
| #TOC>   4        Writing FASTA                          274 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Reading and validating FASTA  ======================================== | ||||
|  | ||||
| # FASTA is a text based format, structured in lines that are separated by | ||||
| # line-feed or paragraph-break characters. Which one of these is used, depends | ||||
| # on your operating system. But R's readLines() function knows how to handle | ||||
| # these correctly, across platforms. Don't try to read such files "by hand". | ||||
| # Here is the yeast Mbp1 gene, via SGD. | ||||
|  | ||||
| file.show("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||
| faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||
|  | ||||
| # The warning is generated because the programmer at the NCBI who implemented | ||||
| # the code to write this FASTA file neglected to place a line-break character | ||||
| # after the last sequence character. While this is not technically incorrect, | ||||
| # it is poor practice: the resulting file can't be distinguished from one that | ||||
| # has been truncated in transmission. | ||||
|  | ||||
| head(faMBP1) | ||||
|  | ||||
| # Note that there are NO line-break characters ("\n") at the end of these | ||||
| # strings, even though they were present in the original file. readLines() | ||||
| # has "consumed" these characters while reading - and every single line has | ||||
| # become an element of its own in the resulting character vector. | ||||
|  | ||||
| tail(faMBP1) | ||||
|  | ||||
| # Also note that the last line has fewer characters - this means readLines() | ||||
| # imported the whole line, despite it not being terminated by "\n". | ||||
|  | ||||
| # It's very straightforward to work with such data, for example by collapsing | ||||
| # everything except the first line into a single string ... | ||||
|  | ||||
| f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = "")) | ||||
|  | ||||
| f[1] | ||||
| nchar(f[2]) | ||||
|  | ||||
| # ==   1.1  Validating FASTA  ================================================== | ||||
|  | ||||
| # The code above is making the assumption that everything from line 2 until | ||||
| #  the end IS sequence, the whole sequence and nothing but sequence. | ||||
| #  That assumption can break down in many ways: | ||||
| # | ||||
| #  - there could be more than one header line. The specification says otherwise, | ||||
| #       but some older files use multiple, consecutive header lines. You don't | ||||
| #       want that to end up in your sequence. | ||||
| #  - this could be not a FASTA file at all. It could be raw sequence, a | ||||
| #       different sequence file format, or a wholly different file altogether. | ||||
| #       If you look at the file, you can immediately tell, but if you are | ||||
| #       reading the file in a complex workflow, you could easily import wrong | ||||
| #       data into your analysis. | ||||
| #  - there could be more than one sequence in the file. Such Multi-FASTA files | ||||
| #       occur commonly, as downloads of ORFs from genome regions or other | ||||
| #       sets of genes or proteins, or as the input / output for multiple | ||||
| #       sequence alignment programs. | ||||
| # | ||||
| # Data "from the wild" can (and usually does) have the most unexpected | ||||
| # variations and it is really, really important to be clear about the | ||||
| # assumptions that you are making. It is possible to "fix" things, according | ||||
| # to the "Robustness Principle" : | ||||
| #      "Be conservative in what you send, | ||||
| #       be liberal in what you accept". | ||||
| #       (cf. https://en.wikipedia.org/wiki/Robustness_principle ) | ||||
| # ... but if you think about this, that's actually a really poor idea, | ||||
| # which is much more likely to dilute standards, make unwarranted | ||||
| # assumptions, and allow errors to pass silently and corrupt data. | ||||
| # | ||||
| # Let's discard this principle on the trash-heap of | ||||
| # things-that-sound-like-a-good-idea-but-aren't. What we do instead is test, | ||||
| # identify problems, and follow the principle: "crash early, crash often". Of | ||||
| # course I can write code that would reformat any possible input as a FASTA | ||||
| # file - but what good will it do me if it parses the file I receive | ||||
| # from a server into FASTA format like: | ||||
| # | ||||
| #   >404- Page Not Found</title</head> | ||||
| #   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe | ||||
| #   spellingrcntacttheadministratrsdyhtml | ||||
| # | ||||
| # Therefore, we write ourselves a FASTA checker that will enforce the following: | ||||
| #   (1) a FASTA file contains one or more sequences separated by zero or | ||||
| #       more empty lines | ||||
| #   (2) a sequence contains one header line followed by | ||||
| #       one or more sequence lines | ||||
| #   (3) a sequence line contains one or more uppercase or lowercase single | ||||
| #       letter amino acid codes, hyphens (gap character), or * (stop). | ||||
| # | ||||
| #   Anything else should generate an error. | ||||
|  | ||||
| #   (Case 1): Header(s) exist | ||||
| fX <- c("ABC", | ||||
|         "defghi", | ||||
|         "klmnpq") | ||||
| sel <- grepl("^>", fX)  # "^>" is a regular expression that | ||||
|                         # means: the exact character ">" at the | ||||
|                         # beginning ("^") of the line. | ||||
| if ( ! any(sel) ) { stop("no header lines in input.") } | ||||
|  | ||||
|  | ||||
| #   (Case 2) No adjacent header lines | ||||
| fX <- c(">ABC", | ||||
|         ">123", | ||||
|         "defghi", | ||||
|         "klmnpq") | ||||
| sel <- grepl("^>", fX) | ||||
| sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors | ||||
| if ( any(sel)) { stop("adjacent header lines in input.") } | ||||
|  | ||||
| #   (Case 3.1) all sequence lines contain only valid characters | ||||
| #              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG | ||||
| #               are defined with the .utilities.R script) | ||||
| AAVALID | ||||
| fX <- c(">ABC", | ||||
|         "def ;-) ghi", | ||||
|         "klmnpq") | ||||
| myRegex <- sprintf("[^%s]", AAVALID)  # matches any character NOT in AAVALID | ||||
| sel <- ! grepl("^>", fX)              # TRUE for lines that are NOT headers | ||||
| # Only non-header lines are tested. The second line of this example is | ||||
| # presumably meant to trigger the error (space, ";" and ")" are not | ||||
| # sequence characters) - confirm against the definition of AAVALID. | ||||
| if (any(grepl(myRegex, fX[sel]))) { | ||||
|   stop("invalid character(s) outside of header lines.") | ||||
| } | ||||
|  | ||||
| #   (Case 3.2) all headers are followed directly by | ||||
| #              at least one letter of sequence | ||||
| fX <- c(">ABC", | ||||
|         "", | ||||
|         ">123", | ||||
|         "defghi", | ||||
|         "klmnpq") | ||||
| sel <- grep("^>", fX) + 1             # indexes of headers + 1 | ||||
| myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character | ||||
| if (! all(grepl(myRegex, fX[sel]))) { | ||||
|   stop("a header has no adjacent sequence.") | ||||
| } | ||||
| # Ah, you might ask - couldn't we just have dropped all empty lines, and | ||||
| # then caught this in Case 2? No - for two reasons: we would still miss headers | ||||
| # at the end of file, and, we would have changed the line numbering - and | ||||
| # ideally our "production" function will create information about where the | ||||
| # error is to be found. | ||||
|  | ||||
|  | ||||
| # Now combine this into a function ... | ||||
|  | ||||
| val <- function(fa) { | ||||
|   # Validate a character vector of FASTA-formatted lines. Calls stop() at | ||||
|   # the first rule that is violated. | ||||
|   # Parameters: | ||||
|   #    fa   chr   lines of a FASTA file, one element per line (e.g. as | ||||
|   #               returned by readLines()). | ||||
|   # Value:  NULL (invisible) if none of the checks fails. | ||||
|   # Note:   AAVALID (the valid sequence characters) is not defined in this | ||||
|   #         function; it must exist when val() is called. | ||||
|  | ||||
|   isHeader <- grepl("^>", fa)     # computed once, used by all four checks | ||||
|  | ||||
|   if ( ! any(isHeader) ) { | ||||
|     stop("no header lines in input.") | ||||
|   } | ||||
|  | ||||
|   # A header directly followed by a header: compare the shifted vectors. | ||||
|   if ( any(isHeader[- length(isHeader)] & isHeader[-1]) ) { | ||||
|     stop("adjacent header lines in input.") | ||||
|   } | ||||
|  | ||||
|   # Any character that is NOT in AAVALID on a non-header line. | ||||
|   if ( any(grepl(sprintf("[^%s]", AAVALID), fa[! isHeader])) ) { | ||||
|     stop("invalid character(s) outside of header lines.") | ||||
|   } | ||||
|  | ||||
|   # The line after each header must contain at least one valid character. | ||||
|   # A header on the last line indexes past the end of fa: that yields NA, | ||||
|   # for which grepl() returns FALSE, so this case is caught as well. | ||||
|   if ( ! all(grepl(sprintf("[%s]+", AAVALID), fa[which(isHeader) + 1])) ) { | ||||
|     stop("a header has no adjacent sequence.") | ||||
|   } | ||||
|  | ||||
|   return(invisible(NULL)) | ||||
| } | ||||
|  | ||||
| # Here is an example | ||||
| FA <- c(">head1", | ||||
|         "acdef", | ||||
|         "ghi", | ||||
|         "", | ||||
|         ">head2", | ||||
|         "kl", | ||||
|         ">head3", | ||||
|         "mn", | ||||
|         "pqrs") | ||||
| val(FA)     # ... should not create an error | ||||
|  | ||||
|  | ||||
| # A somewhat more elaborate validateFA() function was loaded with the | ||||
| # ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi- | ||||
| # fasta files have space-characters in their spacer lines. Try it ... | ||||
| validateFA(FA) | ||||
|  | ||||
| # =    2  Parsing FASTA  ======================================================= | ||||
|  | ||||
| # Once we have validated our assumptions about our input, it's quite | ||||
| # painless to parse it. I have put this together as a function and the function | ||||
| # gets loaded from ./.utilities.R | ||||
| # | ||||
|  | ||||
| # Lets try this: | ||||
| #   - the first 3 elements of faMBP1: | ||||
| readFASTA(faMBP1[1:3]) | ||||
|  | ||||
| #   - a multi FASTA file of aligned APSES domain sequences: | ||||
|  | ||||
| refAPSES <- readFASTA("./data/refAPSES.mfa") | ||||
|  | ||||
| # Subset the sequence with "P39678" in the header | ||||
| refAPSES[grep("P39678", refAPSES$head) ,] | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    3  Interpreting FASTA  ================================================== | ||||
|  | ||||
|  | ||||
| # FASTA files are straightforward to interpret - just one thing may be of note: | ||||
| # when working with strings, we can use substr(<string>, <start>, <stop>) to | ||||
| # extract substrings, but more often we expand the string into a vector of | ||||
| # single characters with strsplit(<string>, ""). strsplit() returns a list, | ||||
| # to accommodate that <string> could be a vector of many elements, therefore | ||||
| # we usually unlist() the result if we use it only on a single string. | ||||
|  | ||||
| # Example: How many positive charged residues in "MBP1_SACCE"? | ||||
|  | ||||
| s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], "")) | ||||
| s | ||||
|  | ||||
| sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE | ||||
|                        # for the characters, sum() coerces to 1 and 0 | ||||
|                        # respectively, and that gives us the result. | ||||
|  | ||||
| 100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 % | ||||
|  | ||||
| # residue distribution | ||||
| x <- factor(s, levels = names(AACOLS)) | ||||
| pie(table(x)[names(AACOLS)], col = AACOLS) | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    4  Writing FASTA  ======================================================= | ||||
|  | ||||
|  | ||||
| # Writing FASTA files is mostly just the reverse of reading, with one | ||||
| # twist: we need to break the long sequence string into chunks of the desired | ||||
| # width. The FASTA specification calls for a maximum of 120 characters per line, | ||||
| # but writing out much less than that is common, since it allows to comfortably | ||||
| # view lines on the console, or printing them on a sheet of paper (do we still | ||||
| # do that actually?). How do we break a string into chunks? A combination of | ||||
| # seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work | ||||
| # nicely. (Note that substring() is vectorized, whereas substr() is not!) As we | ||||
| # loop through our FASTA object in memory, we can build the output by c()'ing | ||||
| # blocks of header + sequence to each other. For VERY large objects this might | ||||
| # be slow - in that case, we might want to precalculate the size of the output | ||||
| # object. But that's more of a hypothetical consideration. | ||||
|  | ||||
| ( s <- refAPSES$seq[2] ) | ||||
| nchar(s) | ||||
| w <- 30     # width of chunk | ||||
| (starts <- seq(1, nchar(s), by = w))      # starting index of chunk | ||||
| (ends <- c((starts - 1)[-1], nchar(s)))   # ending index of chunk | ||||
|  | ||||
| # Task: Is this safe? What happens if nchar(s) is shorter than w? | ||||
| #       What happens if nchar(s) is an exact multiple of w? | ||||
|  | ||||
| substring(s, starts, ends) | ||||
| # confirm that the output contains the first and last residue, and both | ||||
| # residues adjacent to the breaks | ||||
|  | ||||
| # As always, the function has been defined in ".utilities.R" for you to use | ||||
| # any time...  type   writeFASTA  to examine it. | ||||
|  | ||||
| # Let's try this... | ||||
|  | ||||
| writeFASTA(refAPSES, width = 40) | ||||
|  | ||||
| # roundtrip for validation: write refAPSES with a different format, | ||||
| # read it back in - the new dataframe must be identical | ||||
| # to the original dataframe. | ||||
| fname <- tempfile() | ||||
| writeFASTA(refAPSES, fn = fname, width = 30) | ||||
| identical(refAPSES, readFASTA(fname)) | ||||
|  | ||||
| # ...works for me  :-) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										1348
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							
							
						
						
									
										1348
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,385 +1,385 @@ | ||||
| # tocID <- "RPR-Genetic_code_optimality.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Genetic_code_optimality unit. | ||||
| # | ||||
| # Version:  1.3 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.3    2020 Maintenance | ||||
| #           1.2    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| #           1.1      Update set.seed() usage | ||||
| #           1.0.1    Fixed two bugs discovered by Suan Chin Yeo. | ||||
| #           1.0      New material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                          Line | ||||
| #TOC> -------------------------------------------------------------- | ||||
| #TOC>   1        Designing a computational experiment             58 | ||||
| #TOC>   2        Setting up the tools                             74 | ||||
| #TOC>   2.1        Natural and alternative genetic codes          77 | ||||
| #TOC>   2.2        Effect of mutations                           135 | ||||
| #TOC>   2.2.1          reverse-translate                         146 | ||||
| #TOC>   2.2.2          Randomly mutate                           171 | ||||
| #TOC>   2.2.3          Forward- translate                        196 | ||||
| #TOC>   2.2.4          measure effect                            213 | ||||
| #TOC>   3        Run the experiment                              267 | ||||
| #TOC>   4        Task solutions                                  363 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # This unit demonstrates R code to simulate alternate genetic codes and evaluate | ||||
| # their robustness to code changes. The approaches are quite simple and you | ||||
| # will be able to come up with obvious refinements; the point of this code is to | ||||
| # demonstrate some R programming techniques, in preparation for more | ||||
| # sophisticated questions later. | ||||
|  | ||||
|  | ||||
| # =    1  Designing a computational experiment  ================================ | ||||
|  | ||||
| # Computational experiments are conducted like wet-lab experiments. We begin | ||||
| # with a hypothesis, then define the observables that relate to the hypothesis, | ||||
| # then define the measures we apply to observations, and finally we interpret | ||||
| # our observations. If we want to learn something about the evolution of the | ||||
| # genetic code ... | ||||
|  | ||||
| #  - we construct a hypothesis such as: the genetic code has evolved so as to | ||||
| #      minimize the effect of mutations; | ||||
| #  - we define the observables: the effect of mutations in | ||||
| #      sequences, given the natural and possible alternative codes; | ||||
| #  - we define the measures to quantify the effect of mutations; | ||||
| #  - then we compute alternatives and interpret the results. | ||||
|  | ||||
|  | ||||
| # =    2  Setting up the tools  ================================================ | ||||
|  | ||||
|  | ||||
| # ==   2.1  Natural and alternative genetic codes  ============================= | ||||
|  | ||||
| # Load genetic code tables from the Biostrings package | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # There are many ways to generate alternative codes. The simplest way is to | ||||
| # randomly assign amino acids to codons. A more sophisticated way is to keep the | ||||
| # redundancy of codons intact, since it may reflect some form of symmetry | ||||
| # breaking that ignores the third nucleotide of a codon for the most part; | ||||
| # therefore we only replace the amino acids of the existing code with random | ||||
| # others. Here are two functions that implement these two ideas about alternate | ||||
| # codes. | ||||
|  | ||||
| randomGC <- function(GC) { | ||||
|   # Build a synthetic genetic code by drawing an amino acid (or stop) for | ||||
|   # each codon, independently, from the symbols used by the input code. | ||||
|   # Parameters: | ||||
|   #    GC   named chr  length-64 character vector of 20 amino acid one-letter | ||||
|   #                       codes plus "*" (stop), named with the codon triplet. | ||||
|   # Value:  named chr  same vector with random assignments; every symbol of | ||||
|   #                       the input code is encoded at least once. | ||||
|  | ||||
|   symbols <- unique(GC)                    # alphabet of the input code | ||||
|   nSymbols <- length(symbols) | ||||
|  | ||||
|   repeat {                                 # draw until no symbol is missing | ||||
|     GC[1:64] <- sample(symbols, 64, replace = TRUE) | ||||
|     if (length(unique(GC)) >= nSymbols) {  # complete code: accept it | ||||
|       break | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   return(GC) | ||||
| } | ||||
|  | ||||
| swappedGC <- function(GC) { | ||||
|   # Build a synthetic genetic code that keeps the codon blocks of the input | ||||
|   # code but permutes which amino acid (or stop) each block encodes. | ||||
|   # Parameters: | ||||
|   #    GC   named chr  length-64 character vector of 20 amino acid one-letter | ||||
|   #                       codes plus "*" (stop), named with the codon triplet. | ||||
|   # Value:  named chr  same vector; codons that shared a symbol in the input | ||||
|   #                       share a (randomly reassigned) symbol in the output. | ||||
|  | ||||
|   original <- unique(GC)                   # symbols of the input code | ||||
|  | ||||
|   # Lookup table: element = new symbol, name = the symbol it replaces. | ||||
|   lookup <- structure(sample(original, length(original)), names = original) | ||||
|  | ||||
|   GC[1:64] <- lookup[GC]                   # translate old symbols to new | ||||
|  | ||||
|   return(GC) | ||||
| } | ||||
|  | ||||
|  | ||||
| # ==   2.2  Effect of mutations  =============================================== | ||||
|  | ||||
|  | ||||
| # To evaluate the effects of mutations we will do the following: | ||||
| #   - we take an amino acid sequence (Mbp1 will do just nicely); | ||||
| #   - we reverse-translate it into a nucleotide sequence; | ||||
| #   - we mutate it randomly; | ||||
| #   - we translate it back to amino acids; | ||||
| #   - we count the number of mutations and evaluate their severity. | ||||
|  | ||||
|  | ||||
| # ===   2.2.1  reverse-translate                     | ||||
|  | ||||
| # To reverse-translate an amino acid vector, we randomly pick one of its | ||||
| # codons from a genetic code, and assemble all codons to a sequence. | ||||
|  | ||||
| traRev <- function(s, GC) { | ||||
|   # Reverse-translate an amino acid sequence: for every residue, pick one of | ||||
|   # the codons that encode it in GC (at random, if there are several). | ||||
|   # Parameters: | ||||
|   #      s   chr   a sequence vector: one amino acid code per element | ||||
|   #      GC  chr   a genetic code: amino acid codes, named with the codon | ||||
|   # Value: | ||||
|   #      A reverse-translated vector of codons, one per element of s | ||||
|  | ||||
|   # Fail early, with a readable message, if a residue has no codon in GC. | ||||
|   # Without this check the assignment in the loop fails with the much less | ||||
|   # helpful "replacement has length zero". NA residues are not checked. | ||||
|   noCodon <- setdiff(s[! is.na(s)], GC) | ||||
|   if (length(noCodon) > 0) { | ||||
|     stop(sprintf("no codon in GC for: %s", paste(noCodon, collapse = ", "))) | ||||
|   } | ||||
|  | ||||
|   vC <- character(length(s)) | ||||
|  | ||||
|   for (i in seq_along(s)) { | ||||
|     codon <- names(GC)[GC == s[i]]   # get all codons for this AA | ||||
|     if (length(codon) > 1) {         # if there's more than one ... | ||||
|       codon <- sample(codon, 1)      # pick one at random ... | ||||
|     } | ||||
|     vC[i] <- codon                   # store it | ||||
|   } | ||||
|  | ||||
|   return(vC) | ||||
| } | ||||
|  | ||||
|  | ||||
| # ===   2.2.2  Randomly mutate                       | ||||
|  | ||||
| # To mutate, we split a codon into its three nucleotides, then randomly replace | ||||
| # one of the three with another nucleotide. | ||||
|  | ||||
| randMut <- function(vC) { | ||||
|   # Introduce one random point mutation into every codon of vC. | ||||
|   # Parameter: | ||||
|   #    vC   chr     a vector of codons | ||||
|   # Value:  chr     vC, in which one randomly chosen position (1 to 3) of | ||||
|   #                 each codon has been replaced by a different nucleotide | ||||
|  | ||||
|   bases <- c("A", "C", "G", "T") | ||||
|  | ||||
|   pointMutate <- function(codon) { | ||||
|     # Mutate a single codon: first draw the position, then the replacement. | ||||
|     nucs <- unlist(strsplit(codon, ""))                # individual letters | ||||
|     pos <- sample(1:3, 1)                              # position to change | ||||
|     nucs[pos] <- sample(bases[bases != nucs[pos]], 1)  # any other base | ||||
|     paste0(nucs, collapse = "")                        # back to one string | ||||
|   } | ||||
|  | ||||
|   for (k in seq_along(vC)) {        # in order, one codon after the other | ||||
|     vC[k] <- pointMutate(vC[k]) | ||||
|   } | ||||
|   return(vC) | ||||
|  | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # ===   2.2.3  Forward- translate                    | ||||
|  | ||||
| traFor <- function(vC, GC) { | ||||
|   # Translate codons into amino acids by looking each codon up in GC. | ||||
|   # Parameters: | ||||
|   #      vC   chr   a codon vector | ||||
|   #      GC   chr   a genetic code: amino acid codes, named with the codon | ||||
|   # Value: | ||||
|   #      A vector of amino acids, one per codon (NA for a codon that is not | ||||
|   #      a name of GC) | ||||
|  | ||||
|   # Indexing by name is vectorized: all codons are looked up in one step. | ||||
|   # as.character() drops the codon names from the result. | ||||
|   vAA <- as.character(GC[vC]) | ||||
|  | ||||
|   return(vAA) | ||||
| } | ||||
|  | ||||
|  | ||||
| # ===   2.2.4  measure effect                        | ||||
|  | ||||
| # How do we evaluate the effect of the mutation? We'll take a simple ad hoc | ||||
| # approach: we divide amino acids into hydrophobic, hydrophilic, and neutral | ||||
| # categories, according to their free energy of transfer from water to octanol: | ||||
| aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F") | ||||
| aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R") | ||||
| aaNeutral <- c("A", "H", "T", "S", "V", "G") | ||||
|  | ||||
| # Then we will penalize as follows: | ||||
| # Changes within one category: 0.1 | ||||
| # Changes from hydrophobic or hydrophilic to neutral or back: 0.3 | ||||
| # Changes from hydrophobic to hydrophilic or back: 1.0 | ||||
| # Changes to stop-codon: 3.0 | ||||
|  | ||||
| evalMut <- function(nat, mut, penStop = 3.0) { | ||||
|   # Evaluate severity of mutations between amino acid sequence vectors nat and | ||||
|   # mut in an ad hoc approach based on hydrophobicity changes. | ||||
|   # Parameters: | ||||
|   #    nat      chr   the original sequence, one amino acid per element | ||||
|   #    mut      chr   the mutated sequence, same length as nat | ||||
|   #    penStop  num   penalty for a change to a stop codon ("*") | ||||
|   # Value:  num   the sum of penalties over all positions: | ||||
|   #                 0.1  change within one category | ||||
|   #                 0.3  hydrophobic or hydrophilic <-> neutral | ||||
|   #                 1.0  hydrophobic <-> hydrophilic | ||||
|   #                 penStop (3.0)  change to a stop codon | ||||
|   # Note:   the stop-codon penalty used to be missing, i.e. nonsense | ||||
|   #         mutations were scored as 0. Totals that were computed without it | ||||
|   #         are lower than what this function returns. | ||||
|   aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F") | ||||
|   aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R") | ||||
|   aaNeutral <- c("A", "H", "T", "S", "V", "G") | ||||
|  | ||||
|   # Positions are compared pairwise: silent recycling would be meaningless. | ||||
|   stopifnot(length(nat) == length(mut)) | ||||
|  | ||||
|   penalties <- numeric(length(nat)) | ||||
|   lMut <- nat != mut    # logical TRUE for all mutated positions | ||||
|  | ||||
|   penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1 | ||||
|   penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0 | ||||
|   penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3 | ||||
|  | ||||
|   penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0 | ||||
|   penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1 | ||||
|   penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3 | ||||
|  | ||||
|   penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3 | ||||
|   penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3 | ||||
|   penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1 | ||||
|  | ||||
|   # "*" is in none of the three categories, so none of the assignments above | ||||
|   # applies to it: changes to stop need their own rule. A change FROM stop | ||||
|   # to an amino acid is not penalized. | ||||
|   penalties[lMut & (mut %in% "*")] <- penStop | ||||
|  | ||||
|   return(sum(penalties)) | ||||
| } | ||||
|  | ||||
| # A more sophisticated approach could take additional quantities into account, | ||||
| # such as charge, size, or flexibility - and it could add heuristics, such as: | ||||
| # proline is always bad in secondary structure, charged amino acids are terrible | ||||
| # in the folded core of a protein, replacing a small by a large amino acid in | ||||
| # the core is very disruptive ... etc. | ||||
| # | ||||
| # For our experiment, we should not  use a mutation data matrix however: | ||||
| # empirical mutation probabilities are superbly suited to estimate evolutionary | ||||
| # relationships. Here however, as we are trying to evaluate effects of random | ||||
| # mutations on genetic codes, our reasoning would be circular - we would | ||||
| # discover that the natural genetic code is optimal ... because it is most | ||||
| # similar to the natural genetic code. That would be Cargo Cult bioinformatics. | ||||
|  | ||||
|  | ||||
| # =    3  Run the experiment  ================================================== | ||||
|  | ||||
| # Fetch the standard Genetic code from Biostrings:: | ||||
|  | ||||
| stdCode <- Biostrings::GENETIC_CODE | ||||
|  | ||||
| # Fetch the nucleotide sequence for MBP1: | ||||
|  | ||||
| myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1] | ||||
| myDNA <- paste0(myDNA, collapse = "") | ||||
| myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA))) | ||||
| myDNA <- myDNA[-length(myDNA)]  # drop the stop codon | ||||
|  | ||||
| myAA <- traFor(myDNA, stdCode) | ||||
|  | ||||
| # Mutate and evaluate | ||||
| set.seed(112358) | ||||
| x <- randMut(myDNA) | ||||
| set.seed(NULL) | ||||
| x <- traFor(x, stdCode) | ||||
| evalMut(myAA, x)  # 166.4 | ||||
|  | ||||
| # Try this 200 times, and see how the values are distributed. | ||||
| N <- 200 | ||||
| valSTDC <- numeric(N) | ||||
|  | ||||
| set.seed(112358)                   # set RNG seed for repeatable randomness | ||||
| for (i in 1:N) {                   # this takes a few seconds ... | ||||
|   x <- randMut(myDNA)              # mutate | ||||
|   x <- traFor(x, stdCode)     # translate | ||||
|   valSTDC[i] <- evalMut(myAA, x)    # evaluate | ||||
| } | ||||
| set.seed(NULL)                     # reset the RNG | ||||
|  | ||||
| hist(valSTDC, | ||||
|      breaks = 15, | ||||
|      col = "palegoldenrod", | ||||
|      xlim = c(0, 400), | ||||
|      ylim = c(0, N/4), | ||||
|      main = "Standard vs. Synthetic Genetic Code", | ||||
|      xlab = "Mutation penalty") | ||||
|  | ||||
| # This looks like a normal distribution. Let's assume the effect of mutations | ||||
| # under the standard genetic code is the mean of this distribution: | ||||
| effectSTDC <- mean(valSTDC)  # 178.1 | ||||
|  | ||||
| # Now we can look at the effects of alternate genetic codes: | ||||
|  | ||||
| set.seed(112358) | ||||
| # choose a new code | ||||
| GC <- randomGC(stdCode) | ||||
| set.seed(NULL) | ||||
|  | ||||
| # reverse translate hypothetical sequence according to the new code | ||||
| x <- traRev(myAA, GC) | ||||
|  | ||||
| x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence | ||||
| x <- traFor(x, GC)     # translate back, with the new code | ||||
| evalMut(myAA, x)       # evaluate mutation effects: 298.5 | ||||
|  | ||||
| # That seems a fair bit higher than what we saw as "effectSTDC" | ||||
| # Let's try with different genetic codes. 200 trials - but this time every trial | ||||
| # is with a different, synthetic genetic code. | ||||
|  | ||||
| N <- 200 | ||||
| valXGC <- numeric(N) | ||||
|  | ||||
| set.seed(1414214)                # set RNG seed for repeatable randomness | ||||
| for (i in 1:N) { | ||||
|   GC <- randomGC(stdCode)   # Choose code | ||||
|   x <- traRev(myAA, GC)          # reverse translate | ||||
|   x <- randMut(x)                # mutate | ||||
|   x <- traFor(x, GC)             # translate | ||||
|   valXGC[i] <- evalMut(myAA, x)  # evaluate | ||||
| } | ||||
| set.seed(NULL)                   # reset the RNG | ||||
|  | ||||
| hist(valXGC, | ||||
|      col = "plum", | ||||
|      breaks = 15, | ||||
|      add = TRUE) | ||||
|  | ||||
| # These two distributions are very widely separated! | ||||
|  | ||||
| # Task: Perform the same experiment with the swapped genetic code. | ||||
| #       Compare the distributions. Interpret the result. | ||||
|  | ||||
|  | ||||
| # These are simple experiments, under assumptions that can be refined in | ||||
| # meaningful ways. Yet, even those simple computational experiments show | ||||
| # that the Universal Genetic Code has features that one would predict if | ||||
| # it has evolved under selective pressure to minimize the effects of mutations. | ||||
| # Gradual change under mutation is beneficial to evolution, disruptive | ||||
| # change is not. | ||||
|  | ||||
|  | ||||
| # =    4  Task solutions  ====================================================== | ||||
|  | ||||
| N <- 200 | ||||
| valSGC <- numeric(N) | ||||
|  | ||||
| set.seed(2718282)                # set RNG seed for repeatable randomness | ||||
| for (i in 1:N) { | ||||
|   GC <- swappedGC(stdCode)  # Choose code | ||||
|   x <- traRev(myAA, GC)          # reverse translate | ||||
|   x <- randMut(x)                # mutate | ||||
|   x <- traFor(x, GC)             # translate | ||||
|   valSGC[i] <- evalMut(myAA, x)  # evaluate | ||||
| } | ||||
| set.seed(NULL)                   # reset the RNG | ||||
|  | ||||
| hist(valSGC, | ||||
|      col = "#6688FF88", | ||||
|      breaks = 15, | ||||
|      add = TRUE) | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-Genetic_code_optimality.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Genetic_code_optimality unit. | ||||
| # | ||||
| # Version:  1.3 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.3    2020 Maintenance | ||||
| #           1.2    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #                      use BiocManager:: not biocLite() | ||||
| #           1.1      Update set.seed() usage | ||||
| #           1.0.1    Fixed two bugs discovered by Suan Chin Yeo. | ||||
| #           1.0      New material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                          Line | ||||
| #TOC> -------------------------------------------------------------- | ||||
| #TOC>   1        Designing a computational experiment             58 | ||||
| #TOC>   2        Setting up the tools                             74 | ||||
| #TOC>   2.1        Natural and alternative genetic codes          77 | ||||
| #TOC>   2.2        Effect of mutations                           135 | ||||
| #TOC>   2.2.1          reverse-translate                         146 | ||||
| #TOC>   2.2.2          Randomly mutate                           171 | ||||
| #TOC>   2.2.3          Forward- translate                        196 | ||||
| #TOC>   2.2.4          measure effect                            213 | ||||
| #TOC>   3        Run the experiment                              267 | ||||
| #TOC>   4        Task solutions                                  363 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # This unit demonstrates R code to simulate alternate genetic codes and evaluate | ||||
| # their robustness to code changes. The approaches are quite simple and you | ||||
| # will be able to come up with obvious refinements; the point of this code is to | ||||
| # demonstrate some R programming techniques, in preparation for more | ||||
| # sophisticated questions later. | ||||
|  | ||||
|  | ||||
| # =    1  Designing a computational experiment  ================================ | ||||
|  | ||||
| # Computational experiments are conducted like wet-lab experiments. We begin | ||||
| # with a hypothesis, then define the observables that relate to the hypothesis, | ||||
| # then define the measures we apply to observations, and finally we interpret | ||||
| # our observations. If we want to learn something about the evolution of the | ||||
| # genetic code ... | ||||
|  | ||||
| #  - we construct a hypothesis such as: the genetic code has evolved so as to | ||||
| #      minimize the effect of mutations; | ||||
| #  - we define the observables: the effect of mutations in | ||||
| #      sequences, given the natural and possible alternative codes; | ||||
| #  - we define the measures to quantify the effect of mutations; | ||||
| #  - then we compute alternatives and interpret the results. | ||||
|  | ||||
|  | ||||
| # =    2  Setting up the tools  ================================================ | ||||
|  | ||||
|  | ||||
| # ==   2.1  Natural and alternative genetic codes  ============================= | ||||
|  | ||||
| # Load genetic code tables from the Biostrings package | ||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||
|   install.packages("BiocManager") | ||||
| } | ||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||
|   BiocManager::install("Biostrings") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = Biostrings)       # basic information | ||||
| #  browseVignettes("Biostrings")    # available vignettes | ||||
| #  data(package = "Biostrings")     # available datasets | ||||
|  | ||||
|  | ||||
| # There are many ways to generate alternative codes. The simplest way is to | ||||
| # randomly assign amino acids to codons. A more sophisticated way is to keep the | ||||
| # redundancy of codons intact, since it may reflect some form of symmetry | ||||
| # breaking that ignores the third nucleotide of a codon for the most part; | ||||
| # therefore we only replace the amino acids of the existing code with random | ||||
| # others. Here are two functions that implement these two ideas about alternate | ||||
| # codes. | ||||
|  | ||||
| randomGC <- function(GC) { | ||||
|   # Return a genetic code with randomly assigned amino acids. | ||||
|   # Parameters: | ||||
|   #    GC   named chr  length-64 character vector of 20 amino acid one-letter | ||||
|   #                       codes plus "*" (stop), named with the codon triplet. | ||||
|   # Value:  named chr  same vector with random amino acid assignments in which | ||||
|   #                       every amino acid and "*" is encoded at least once. | ||||
|  | ||||
|   aa <- unique(GC)                           # the amino acids in the input code | ||||
|   GC[1:64] <- sample(aa, 64, replace = TRUE) # random code | ||||
|   while(length(unique(GC)) < length(aa)) {   # We could end up with a code that | ||||
|                                              # does not contain all amino acids, | ||||
|                                              # then we sample() again. | ||||
|     GC[1:64] <- sample(aa, 64, replace = TRUE) | ||||
|   } | ||||
|   return(GC) | ||||
| } | ||||
|  | ||||
| swappedGC <- function(GC) { | ||||
|   # Return a genetic code with randomly swapped amino acids. | ||||
|   # Parameters: | ||||
|   #    GC   named chr  length-64 character vector of 20 amino acid one-letter | ||||
|   #                       codes plus "*" (stop), named with the codon triplet. | ||||
|   # Value:  named chr  same vector with random amino acid assignments where the | ||||
|   #                       amino acids have been swapped. | ||||
|  | ||||
|   aaOrig <- unique(GC)                       # the amino acids in the input code | ||||
|   aaSwap <- sample(aaOrig, length(aaOrig))   # shuffled | ||||
|   names(aaSwap) <- aaOrig                    # name them after the original | ||||
|   GC[1:64] <- aaSwap[GC]                     # replace original with shuffled | ||||
|  | ||||
|   return(GC) | ||||
| } | ||||
|  | ||||
|  | ||||
| # ==   2.2  Effect of mutations  =============================================== | ||||
|  | ||||
|  | ||||
| # To evaluate the effects of mutations we will do the following: | ||||
| #   - we take an amino acid sequence (Mbp1 will do just nicely); | ||||
| #   - we reverse-translate it into a nucleotide sequence; | ||||
| #   - we mutate it randomly; | ||||
| #   - we translate it back to amino acids; | ||||
| #   - we count the number of mutations and evaluate their severity. | ||||
|  | ||||
|  | ||||
| # ===   2.2.1  reverse-translate                     | ||||
|  | ||||
| # To reverse-translate an amino acid vector, we randomly pick one of its | ||||
| # codons from a genetic code, and assemble all codons to a sequence. | ||||
|  | ||||
| traRev <- function(s, GC) { | ||||
|   # Parameters: | ||||
|   #      s   chr   a sequence vector | ||||
|   #      GC  chr   a genetic code | ||||
|   # Value: | ||||
|   #      A reverse-translated vector of codons | ||||
|   vC <- character(length(s)) | ||||
|  | ||||
|   for (i in seq_along(s)) { | ||||
|     codon <- names(GC)[GC == s[i]]   # get all codons for this AA | ||||
|     if (length(codon) > 1) {         # if there's more than one ... | ||||
|       codon <- sample(codon, 1)      # pick one at random ... | ||||
|     } | ||||
|     vC[i] <- codon                   # store it | ||||
|   } | ||||
|  | ||||
|   return(vC) | ||||
| } | ||||
|  | ||||
|  | ||||
| # ===   2.2.2  Randomly mutate                       | ||||
|  | ||||
| # To mutate, we split a codon into its three nucleotides, then randomly replace | ||||
| # one of the three with another nucleotide. | ||||
|  | ||||
| randMut <- function(vC) { | ||||
|   # Parameter: | ||||
|   #    vC   chr     a vector of codons | ||||
|   # Value:  chr     a vector of codons with a single point mutation from vC | ||||
|  | ||||
|   nuc <- c("A", "C", "G", "T") | ||||
|  | ||||
|   for (i in seq_along(vC)) { | ||||
|     triplet <- unlist(strsplit(vC[i], ""))         # split into three nucl. | ||||
|     iNuc <- sample(1:3, 1)                         # choose one of the three | ||||
|     mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # chose a mutated nucleotide | ||||
|     triplet[iNuc] <- mutNuc                        # replace the original | ||||
|     vC[i] <- paste0(triplet, collapse = "")        # collapse it to a codon | ||||
|   } | ||||
|   return(vC) | ||||
|  | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # ===   2.2.3  Forward- translate                    | ||||
|  | ||||
| traFor <- function(vC, GC) { | ||||
|   # Parameters: | ||||
|   #      vC   chr   a codon vector | ||||
|   #      GC   chr   a genetic code | ||||
|   # Value: | ||||
|   #      A vector of amino acids | ||||
|   vAA <- character(length(vC)) | ||||
|  | ||||
|   for (i in seq_along(vC)) { | ||||
|     vAA[i] <- GC[vC[i]]         # translate and store | ||||
|   } | ||||
|   return(vAA) | ||||
| } | ||||
|  | ||||
|  | ||||
| # ===   2.2.4  measure effect                        | ||||
|  | ||||
| # How do we evaluate the effect of the mutation? We'll take a simple ad hoc | ||||
| # approach: we divide amino acids into hydrophobic, hydrophilic, and neutral | ||||
| # categories, according to their free energy of transfer from water to octanol: | ||||
| aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F") | ||||
| aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R") | ||||
| aaNeutral <- c("A", "H", "T", "S", "V", "G") | ||||
|  | ||||
| # Then we will penalize as follows: | ||||
| # Changes within one category: 0.1 | ||||
| # Changes from hydrophobic or hydrophilic to neutral or back: 0.3 | ||||
| # Changes from hydrophobic to hydrophilic or back: 1.0 | ||||
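| # Changes to stop-codon: 3.0 | ||||
| # (Note: evalMut() as written below does not apply the stop-codon penalty; | ||||
| #  a mutation to "*" matches none of the three categories and scores 0.) | ||||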
|  | ||||
| evalMut <- function(nat, mut) { | ||||
|   # Evaluate severity of mutations between amino acid sequence vectors nat and | ||||
|   # mut in an ad hoc approach based on hydrophobicity changes. | ||||
|   aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F") | ||||
|   aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R") | ||||
|   aaNeutral <- c("A", "H", "T", "S", "V", "G") | ||||
|  | ||||
|   penalties <- numeric(length(nat)) | ||||
|   lMut <- nat != mut    # logical TRUE for all mutated positions | ||||
|  | ||||
|   penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1 | ||||
|   penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0 | ||||
|   penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3 | ||||
|  | ||||
|   penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0 | ||||
|   penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1 | ||||
|   penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3 | ||||
|  | ||||
|   penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3 | ||||
|   penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3 | ||||
|   penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1 | ||||
|  | ||||
|   return(sum(penalties)) | ||||
| } | ||||
|  | ||||
| # A more sophisticated approach could take additional quantities into account, | ||||
| # such as charge, size, or flexibility - and it could add heuristics, such as: | ||||
| # proline is always bad in secondary structure, charged amino acids are terrible | ||||
| # in the folded core of a protein, replacing a small by a large amino acid in | ||||
| # the core is very disruptive ... etc. | ||||
| # | ||||
| # For our experiment, we should not use a mutation data matrix, however: | ||||
| # empirical mutation probabilities are superbly suited to estimate evolutionary | ||||
| # relationships. Here however, as we are trying to evaluate effects of random | ||||
| # mutations on genetic codes, our reasoning would be circular - we would | ||||
| # discover that the natural genetic code is optimal ... because it is most | ||||
| # similar to the natural genetic code. That would be Cargo Cult bioinformatics. | ||||
|  | ||||
|  | ||||
| # =    3  Run the experiment  ================================================== | ||||
|  | ||||
| # Fetch the standard Genetic code from Biostrings:: | ||||
|  | ||||
| stdCode <- Biostrings::GENETIC_CODE | ||||
|  | ||||
| # Fetch the nucleotide sequence for MBP1: | ||||
|  | ||||
| myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1] | ||||
| myDNA <- paste0(myDNA, collapse = "") | ||||
| myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA))) | ||||
| myDNA <- myDNA[-length(myDNA)]  # drop the stop codon | ||||
|  | ||||
| myAA <- traFor(myDNA, stdCode) | ||||
|  | ||||
| # Mutate and evaluate | ||||
| set.seed(112358) | ||||
| x <- randMut(myDNA) | ||||
| set.seed(NULL) | ||||
| x <- traFor(x, stdCode) | ||||
| evalMut(myAA, x)  # 166.4 | ||||
|  | ||||
| # Try this 200 times, and see how the values are distributed. | ||||
| N <- 200 | ||||
| valSTDC <- numeric(N) | ||||
|  | ||||
| set.seed(112358)                   # set RNG seed for repeatable randomness | ||||
| for (i in 1:N) {                   # this takes a few seconds ... | ||||
|   x <- randMut(myDNA)              # mutate | ||||
|   x <- traFor(x, stdCode)     # translate | ||||
|   valSTDC[i] <- evalMut(myAA, x)    # evaluate | ||||
| } | ||||
| set.seed(NULL)                     # reset the RNG | ||||
|  | ||||
| hist(valSTDC, | ||||
|      breaks = 15, | ||||
|      col = "palegoldenrod", | ||||
|      xlim = c(0, 400), | ||||
|      ylim = c(0, N/4), | ||||
|      main = "Standard vs. Synthetic Genetic Code", | ||||
|      xlab = "Mutation penalty") | ||||
|  | ||||
| # This looks like a normal distribution. Let's assume the effect of mutations | ||||
| # under the standard genetic code is the mean of this distribution: | ||||
| effectSTDC <- mean(valSTDC)  # 178.1 | ||||
|  | ||||
| # Now we can look at the effects of alternate genetic codes: | ||||
|  | ||||
| set.seed(112358) | ||||
| # choose a new code | ||||
| GC <- randomGC(stdCode) | ||||
| set.seed(NULL) | ||||
|  | ||||
| # reverse translate hypothetical sequence according to the new code | ||||
| x <- traRev(myAA, GC) | ||||
|  | ||||
| x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence | ||||
| x <- traFor(x, GC)     # translate back, with the new code | ||||
| evalMut(myAA, x)       # evaluate mutation effects: 298.5 | ||||
|  | ||||
| # That seems a fair bit higher than what we saw as "effectSTDC" | ||||
| # Let's try with different genetic codes. 200 trials - but this time every trial | ||||
| # is with a different, synthetic genetic code. | ||||
|  | ||||
| N <- 200 | ||||
| valXGC <- numeric(N) | ||||
|  | ||||
| set.seed(1414214)                # set RNG seed for repeatable randomness | ||||
| for (i in 1:N) { | ||||
|   GC <- randomGC(stdCode)   # Choose code | ||||
|   x <- traRev(myAA, GC)          # reverse translate | ||||
|   x <- randMut(x)                # mutate | ||||
|   x <- traFor(x, GC)             # translate | ||||
|   valXGC[i] <- evalMut(myAA, x)  # evaluate | ||||
| } | ||||
| set.seed(NULL)                   # reset the RNG | ||||
|  | ||||
| hist(valXGC, | ||||
|      col = "plum", | ||||
|      breaks = 15, | ||||
|      add = TRUE) | ||||
|  | ||||
| # These two distributions are very widely separated! | ||||
|  | ||||
| # Task: Perform the same experiment with the swapped genetic code. | ||||
| #       Compare the distributions. Interpret the result. | ||||
|  | ||||
|  | ||||
| # These are simple experiments, under assumptions that can be refined in | ||||
| # meaningful ways. Yet, even those simple computational experiments show | ||||
| # that the Universal Genetic Code has features that one would predict if | ||||
| # it has evolved under selective pressure to minimize the effects of mutations. | ||||
| # Gradual change under mutation is beneficial to evolution, disruptive | ||||
| # change is not. | ||||
|  | ||||
|  | ||||
| # =    4  Task solutions  ====================================================== | ||||
|  | ||||
| N <- 200 | ||||
| valSGC <- numeric(N) | ||||
|  | ||||
| set.seed(2718282)                # set RNG seed for repeatable randomness | ||||
| for (i in 1:N) { | ||||
|   GC <- swappedGC(stdCode)  # Choose code | ||||
|   x <- traRev(myAA, GC)          # reverse translate | ||||
|   x <- randMut(x)                # mutate | ||||
|   x <- traFor(x, GC)             # translate | ||||
|   valSGC[i] <- evalMut(myAA, x)  # evaluate | ||||
| } | ||||
| set.seed(NULL)                   # reset the RNG | ||||
|  | ||||
| hist(valSGC, | ||||
|      col = "#6688FF88", | ||||
|      breaks = 15, | ||||
|      add = TRUE) | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,50 +1,50 @@ | ||||
| # tocID <- "RPR-Introduction.R" | ||||
| # | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Introduction unit | ||||
| # | ||||
| # Version: 1.0 | ||||
| # | ||||
| # Date:    2020-09-18 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # V 1.0    Updated workflow; live | ||||
| # V 0.1    First code | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # DO NOT SIMPLY  source()  THESE FILES! | ||||
|  | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| # === TASK: Local script | ||||
| # | ||||
| # - Open the file myScript.R | ||||
| # | ||||
| # - Create a section header with a date. | ||||
| # - Enter an R-expression that will produce the first 11 powers of 2 (starting | ||||
| #     from 0). Not a loop - a single expression. The first number you get must | ||||
| #     be 1. The last number you get must be 1024. | ||||
| # | ||||
| # - Save the file in the myScripts folder, and close it. | ||||
| # | ||||
| # - Open the file again, select the expression and type Cmd+Enter (or Cmd+R) | ||||
| #   to execute it. | ||||
| # | ||||
| # - Done | ||||
|  | ||||
| # (This task is meant to make sure that writing R expressions, saving | ||||
| #  them in scripts, opening script files and executing code in the file works | ||||
| #  for you. If there is an issue, get in touch.) | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-Introduction.R" | ||||
| # | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Introduction unit | ||||
| # | ||||
| # Version: 1.0 | ||||
| # | ||||
| # Date:    2020-09-18 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # V 1.0    Updated workflow; live | ||||
| # V 0.1    First code | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # DO NOT SIMPLY  source()  THESE FILES! | ||||
|  | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| # === TASK: Local script | ||||
| # | ||||
| # - Open the file myScript.R | ||||
| # | ||||
| # - Create a section header with a date. | ||||
| # - Enter an R-expression that will produce the first 11 powers of 2 (starting | ||||
| #     from 0). Not a loop - a single expression. The first number you get must | ||||
| #     be 1. The last number you get must be 1024. | ||||
| # | ||||
| # - Save the file in the myScripts folder, and close it. | ||||
| # | ||||
| # - Open the file again, select the expression and type Cmd+Enter (or Cmd+R) | ||||
| #   to execute it. | ||||
| # | ||||
| # - Done | ||||
|  | ||||
| # (This task is meant to make sure that writing R expressions, saving | ||||
| #  them in scripts, opening script files and executing code in the file works | ||||
| #  for you. If there is an issue, get in touch.) | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,168 +1,168 @@ | ||||
| # tocID <- "RPR-PROSITE_POST.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #           1.0.1  Updates for slightly changed interfaces | ||||
| #           1.0    First ABC units version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                                 Line | ||||
| #TOC> --------------------------------------------------------------------- | ||||
| #TOC>   1        Constructing a POST command from a Web query            43 | ||||
| #TOC>   1.1        Task - fetchPrositeFeatures() function               148 | ||||
| #TOC>   2        Task solutions                                         156 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Constructing a POST command from a Web query  ======================== | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = httr)       # basic information | ||||
| #  browseVignettes("httr")    # available vignettes | ||||
| #  data(package = "httr")     # available datasets | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # We have reverse engineered the Web form for a ScanProsite request, and can | ||||
| # construct a valid POST request from knowing the required field names. The POST | ||||
| # command is similar to GET(), but we need an explicit request body that | ||||
| # contains a list of key/value pairs | ||||
|  | ||||
| UniProtID <- "P39678" | ||||
|  | ||||
| URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" | ||||
|  | ||||
| response <- httr::POST(URL, | ||||
|                        body = list(meta = "opt1", | ||||
|                                    meta1_protein = "opt1", | ||||
|                                    seq = UniProtID, | ||||
|                                    skip = "on", | ||||
|                                    output = "tabular")) | ||||
|  | ||||
| # Send off this request, and you should have a response in a few | ||||
| # seconds. Let's check the status first: | ||||
|  | ||||
| httr::status_code(response)  # If this is not 200, something went wrong and it | ||||
|                              # makes no sense to continue. If this persists, ask | ||||
|                              # on the Discussion Board what to do. | ||||
|  | ||||
|  | ||||
| # The text content of the response is available with the | ||||
| # content() function: | ||||
| httr::content(response, "text") | ||||
|  | ||||
| # ... should show you the same as the page contents that you have seen in the | ||||
| # browser. Now we need to extract the data from the page. For this simple | ||||
| # example we can get away with using regular expressions, but in general we need | ||||
| # a real XML parser to parse HTML. We'll cover that in a later unit. Here, we | ||||
| # strsplit() the response into individual lines, since each of our data elements | ||||
| # is on its own line, and then capture the contents. The way Prosite has | ||||
| # formatted their HTML we can simply split on the "\\n" newline character - but | ||||
| # they could write the same valid HTML without any newline-characters at all. | ||||
| # Understand that we are working with a bit of a "hack" here: exploiting | ||||
| # empirical assumptions rather than a formal specification. But sometimes quick | ||||
| # and dirty is fine, because quick. | ||||
|  | ||||
| lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) | ||||
| head(lines) | ||||
|  | ||||
| # Now we define a query pattern for the lines we want: | ||||
| # we can use the uID, bracketed by two "|" pipe | ||||
| # characters: | ||||
|  | ||||
| patt <- sprintf("\\|%s\\|", UniProtID) | ||||
|  | ||||
| # ... and select only the lines that match this | ||||
| # pattern: | ||||
|  | ||||
| ( lines <- lines[grep(patt, lines)] ) | ||||
|  | ||||
| # ... captures the three lines of output. | ||||
|  | ||||
| # Now we break the lines apart into tokens: this is another application of | ||||
| # strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs | ||||
| # "\t". Look at the regex "\\t|\\|" in the strsplit() call: | ||||
|  | ||||
| unlist(strsplit(lines[1], "\\t|\\|")) | ||||
|  | ||||
| # Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped | ||||
| # with a backslash. "t" has to be escaped because we want to match a tab (\t), | ||||
| # not the literal character "t". And "|" has to be escaped because we mean the | ||||
| # literal pipe character, not its metacharacter meaning OR. Thus sometimes the | ||||
| # backslash turns a special meaning off, and sometimes it turns a special | ||||
| # meaning on. Unfortunately there's no easy way to tell - you just need to | ||||
| # remember the characters - or have a reference handy. The metacharacters are | ||||
| # (){}[]^$?*+.|&-   ... and some of them have different meanings depending on | ||||
| # where in the regex they are. | ||||
|  | ||||
| # Let's put the tokens into named slots of a data frame | ||||
|  | ||||
| features <- data.frame() | ||||
| for (line in lines) { | ||||
|   tokens <- unlist(strsplit(line, "\\t|\\|")) | ||||
|   features <- rbind(features, | ||||
|                     data.frame(uID   =  tokens[2], | ||||
|                                start =  as.numeric(tokens[4]), | ||||
|                                end   =  as.numeric(tokens[5]), | ||||
|                                psID  =  tokens[6], | ||||
|                                psName = tokens[7], | ||||
|                                psSeq  = tokens[11])) | ||||
| } | ||||
| features | ||||
|  | ||||
| #  This forms the base of a function that collects the features automatically | ||||
| #  from a PrositeScan result. You can write this! | ||||
|  | ||||
|  | ||||
| # ==   1.1  Task - fetchPrositeFeatures() function  ============================ | ||||
|  | ||||
|  | ||||
| # Task: write a function that takes as input a UniProt ID, fetches the | ||||
| # features it contains from ScanProsite and returns a data frame as given above, or | ||||
| # an empty data frame if there is an error. | ||||
|  | ||||
|  | ||||
| # =    2  Task solutions  ====================================================== | ||||
|  | ||||
|  | ||||
| # I have placed such a function into the ABC-dbUtilities.R script: look it up by | ||||
| # clicking on  dbFetchPrositeFeatures() in the Environment pane. | ||||
|  | ||||
| # Test: | ||||
| dbFetchPrositeFeatures("Q5KMQ9") | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-PROSITE_POST.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout, | ||||
| #           1.0.1  Updates for slightly changed interfaces | ||||
| #           1.0    First ABC units version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                                 Line | ||||
| #TOC> --------------------------------------------------------------------- | ||||
| #TOC>   1        Constructing a POST command from a Web query            43 | ||||
| #TOC>   1.1        Task - fetchPrositeFeatures() function               148 | ||||
| #TOC>   2        Task solutions                                         156 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Constructing a POST command from a Web query  ======================== | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = httr)       # basic information | ||||
| #  browseVignettes("httr")    # available vignettes | ||||
| #  data(package = "httr")     # available datasets | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # We have reverse engineered the Web form for a ScanProsite request, and can | ||||
| # construct a valid POST request from knowing the required field names. The POST | ||||
| # command is similar to GET(), but we need an explicit request body that | ||||
| # contains a list of key/value pairs | ||||
|  | ||||
| UniProtID <- "P39678" | ||||
|  | ||||
| URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" | ||||
|  | ||||
| response <- httr::POST(URL, | ||||
|                        body = list(meta = "opt1", | ||||
|                                    meta1_protein = "opt1", | ||||
|                                    seq = UniProtID, | ||||
|                                    skip = "on", | ||||
|                                    output = "tabular")) | ||||
|  | ||||
| # Send off this request, and you should have a response in a few | ||||
| # seconds. Let's check the status first: | ||||
|  | ||||
| httr::status_code(response)  # If this is not 200, something went wrong and it | ||||
|                              # makes no sense to continue. If this persists, ask | ||||
|                              # on the Discussion Board what to do. | ||||
|  | ||||
|  | ||||
| # The text contents of the response are available with the | ||||
| # content() function: | ||||
| httr::content(response, "text") | ||||
|  | ||||
| # ... should show you the same as the page contents that you have seen in the | ||||
| # browser. Now we need to extract the data from the page. For this simple | ||||
| # example we can get away with using regular expressions, but in general we need | ||||
| # a real XML parser to parse HTML. We'll cover that in a later unit. Here, we | ||||
| # strsplit() the response into individual lines, since each of our data elements | ||||
| # is on its own line, and then capture the contents. The way Prosite has | ||||
| # formatted their HTML we can simply split on the "\\n" newline character - but | ||||
| # they could write the same valid HTML without any newline-characters at all. | ||||
| # Understand that we are working with a bit of a "hack" here: exploiting | ||||
| # empirical assumptions rather than a formal specification. But sometimes quick | ||||
| # and dirty is fine, because quick. | ||||
|  | ||||
| lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) | ||||
| head(lines) | ||||
|  | ||||
| # Now we define a query pattern for the lines we want: | ||||
| # we can use the uID, bracketed by two "|" pipe | ||||
| # characters: | ||||
|  | ||||
| patt <- sprintf("\\|%s\\|", UniProtID) | ||||
|  | ||||
| # ... and select only the lines that match this | ||||
| # pattern: | ||||
|  | ||||
| ( lines <- lines[grep(patt, lines)] ) | ||||
|  | ||||
| # ... captures the three lines of output. | ||||
|  | ||||
| # Now we break the lines apart into tokens: this is another application of | ||||
| # strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs | ||||
| # "\t". Look at the regex "\\t|\\|" in the strsplit() call: | ||||
|  | ||||
| unlist(strsplit(lines[1], "\\t|\\|")) | ||||
|  | ||||
| # Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped | ||||
| # with a backslash. "t" has to be escaped because we want to match a tab (\t), | ||||
| # not the literal character "t". And "|" has to be escaped because we mean the | ||||
| # literal pipe character, not its metacharacter meaning OR. Thus sometimes the | ||||
| # backslash turns a special meaning off, and sometimes it turns a special | ||||
| # meaning on. Unfortunately there's no easy way to tell - you just need to | ||||
| # remember the characters - or have a reference handy. The metacharacters are | ||||
| # (){}[]^$?*+.|&-   ... and some of them have different meanings depending on | ||||
| # where in the regex they are. | ||||
|  | ||||
| # Let's put the tokens into named slots of a data frame | ||||
|  | ||||
| features <- data.frame() | ||||
| for (line in lines) { | ||||
|   tokens <- unlist(strsplit(line, "\\t|\\|")) | ||||
|   features <- rbind(features, | ||||
|                     data.frame(uID   =  tokens[2], | ||||
|                                start =  as.numeric(tokens[4]), | ||||
|                                end   =  as.numeric(tokens[5]), | ||||
|                                psID  =  tokens[6], | ||||
|                                psName = tokens[7], | ||||
|                                psSeq  = tokens[11])) | ||||
| } | ||||
| features | ||||
|  | ||||
| #  This forms the base of a function that collects the features automatically | ||||
| #  from a PrositeScan result. You can write this! | ||||
|  | ||||
|  | ||||
| # ==   1.1  Task - fetchPrositeFeatures() function  ============================ | ||||
|  | ||||
|  | ||||
| # Task: write a function that takes as input a UniProt ID, fetches the | ||||
| # features it contains from ScanProsite and returns a data frame as given above, or | ||||
| # an empty data frame if there is an error. | ||||
|  | ||||
|  | ||||
| # =    2  Task solutions  ====================================================== | ||||
|  | ||||
|  | ||||
| # I have placed such a function into the ABC-dbUtilities.R script: look it up by | ||||
| # clicking on  dbFetchPrositeFeatures() in the Environment pane. | ||||
|  | ||||
| # Test: | ||||
| dbFetchPrositeFeatures("Q5KMQ9") | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										270
									
								
								RPR-Pipe.R
									
									
									
									
									
								
							
							
						
						
									
										270
									
								
								RPR-Pipe.R
									
									
									
									
									
								
							| @@ -1,135 +1,135 @@ | ||||
| # tocID <- "RPR-Pipe.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              Discussing pipe operators. | ||||
| # | ||||
| # Version:  1.0 | ||||
| # | ||||
| # Date:     2021  10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.0    New code | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #   - find more interesting examples | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                            Line | ||||
| #TOC> ------------------------------------------------ | ||||
| #TOC>   1        Pipe  Concept                      41 | ||||
| #TOC>   2        Nested Expression                  73 | ||||
| #TOC>   3        magrittr:: Pipe                    78 | ||||
| #TOC>   4        Base R Pipe                        93 | ||||
| #TOC>   5        Intermediate Assignment           108 | ||||
| #TOC>   6        Postscript                        127 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Pipe  Concept  ======================================================= | ||||
|  | ||||
| # Pipes are actually an awesome idea for any code that implements a workflow - | ||||
| # a sequence of operations, each of which transforms data in a specialized way. | ||||
| # | ||||
| # This principle is familiar from maths: chained functions. If I have a function | ||||
| # y = f(x) and want to use those results as in z = g(y), I can just write | ||||
| # z = g(f(x)) | ||||
| # | ||||
| # On the unix command line, pipes were used from the very beginning, implemented | ||||
| # with the "|" pipe character. | ||||
| # | ||||
| # In R, the magrittr package provided the %>% operator, and recently the |> | ||||
| # operator has been introduced into base R. | ||||
| # | ||||
| # However there are alternatives: intermediate assignment, and nested functions | ||||
| # that have always existed in base R anyway. | ||||
| # | ||||
| # Let us look at an example. In writing this, I found out that virtually | ||||
| # ALL non-trivial examples I came up with don't translate well into this idiom | ||||
| # at all. It is actually quite limited to simple filtering operations on | ||||
| # data. A more interesting example might be added in the future, let me know if | ||||
| # you have a good idea. | ||||
| # | ||||
| # A somewhat contrived example is to sort a list of files by the | ||||
| # length of the file names: | ||||
|  | ||||
| myFiles <- list.files(pattern = "\\.R$") | ||||
|  | ||||
| # nchar() gives the number of characters in a string, order() produces indices | ||||
| # that map an array to its sorted form. | ||||
| # | ||||
| # =    2  Nested Expression  =================================================== | ||||
|  | ||||
| myFiles[order(nchar(myFiles))] | ||||
|  | ||||
|  | ||||
| # =    3  magrittr:: Pipe  ===================================================== | ||||
|  | ||||
| if (! requireNamespace("magrittr", quietly = TRUE)) { | ||||
|   install.packages("magrittr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = magrittr)       # basic information | ||||
| #  browseVignettes("magrittr")    # available vignettes | ||||
| #  data(package = "magrittr")     # available datasets | ||||
|  | ||||
|  | ||||
| library(magrittr) | ||||
|  | ||||
| myFiles  %>% nchar %>% order %>% myFiles[.] | ||||
|  | ||||
| # =    4  Base R Pipe  ========================================================= | ||||
|  | ||||
| # Since version 4.1, base R now supports a pipe operator without the need | ||||
| # to load a special package. Such an introduction of external functionality | ||||
| # into the language is very rare. | ||||
| # | ||||
| # Unfortunately it won't (yet) work with the '[' function, so we need to write | ||||
| # an intermediate function for this example | ||||
| extract <- function(x, v) { | ||||
|   return(v[x]) | ||||
| } | ||||
|  | ||||
| myFiles |> nchar() |> order() |> extract(myFiles) | ||||
|  | ||||
|  | ||||
| # =    5  Intermediate Assignment  ============================================= | ||||
|  | ||||
| # So what's the problem? As you can see, the piped code may be concise and | ||||
| # expressive. But there is also a large amount of implicit assignment and | ||||
| # processing going on and that is usually a bad idea because it makes code hard | ||||
| # to maintain. I am NOT a big fan of the nested syntax, but I don't think that | ||||
| # replacing it with the pipe makes things much better. My preferred idiom is | ||||
| # to use intermediate assignments. Only then is it convenient to examine | ||||
| # the code step by step and validate every single step. And that is the most | ||||
| # important objective of all: no code is good if it does not compute | ||||
| # correctly. | ||||
|  | ||||
|  | ||||
| x <- nchar(myFiles) | ||||
| x <- order(x) | ||||
| myFiles[x] | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    6  Postscript  ========================================================== | ||||
|  | ||||
| # I tried to write an example that strips all comments from a list of files, and | ||||
| # another example that finds all files that were not yet updated this year | ||||
| # (according to the "# Date:" in the header). Neither example can be well | ||||
| # written without intermediate assignments, or at least sapply() functions | ||||
| # that are not simpler at all than the intermediate assignment. | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-Pipe.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              Discussing pipe operators. | ||||
| # | ||||
| # Version:  1.0 | ||||
| # | ||||
| # Date:     2021  10 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.0    New code | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| #   - find more interesting examples | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                            Line | ||||
| #TOC> ------------------------------------------------ | ||||
| #TOC>   1        Pipe  Concept                      41 | ||||
| #TOC>   2        Nested Expression                  73 | ||||
| #TOC>   3        magrittr:: Pipe                    78 | ||||
| #TOC>   4        Base R Pipe                        93 | ||||
| #TOC>   5        Intermediate Assignment           108 | ||||
| #TOC>   6        Postscript                        127 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Pipe  Concept  ======================================================= | ||||
|  | ||||
| # Pipes are actually an awesome idea for any code that implements a workflow - | ||||
| # a sequence of operations, each of which transforms data in a specialized way. | ||||
| # | ||||
| # This principle is familiar from maths: chained functions. If I have a function | ||||
| # y = f(x) and want to use those results as in z = g(y), I can just write | ||||
| # z = g(f(x)) | ||||
| # | ||||
| # On the unix command line, pipes were used from the very beginning, implemented | ||||
| # with the "|" pipe character. | ||||
| # | ||||
| # In R, the magrittr package provided the %>% operator, and recently the |> | ||||
| # operator has been introduced into base R. | ||||
| # | ||||
| # However there are alternatives: intermediate assignment, and nested functions | ||||
| # that have always existed in base R anyway. | ||||
| # | ||||
| # Let us look at an example. In writing this, I found out that virtually | ||||
| # ALL non-trivial examples I came up with don't translate well into this idiom | ||||
| # at all. It is actually quite limited to simple filtering operations on | ||||
| # data. A more interesting example might be added in the future, let me know if | ||||
| # you have a good idea. | ||||
| # | ||||
| # A somewhat contrived example is to sort a list of files by the | ||||
| # length of the file names: | ||||
|  | ||||
| myFiles <- list.files(pattern = "\\.R$") | ||||
|  | ||||
| # nchar() gives the number of characters in a string, order() produces indices | ||||
| # that map an array to its sorted form. | ||||
| # | ||||
| # =    2  Nested Expression  =================================================== | ||||
|  | ||||
| myFiles[order(nchar(myFiles))] | ||||
|  | ||||
|  | ||||
| # =    3  magrittr:: Pipe  ===================================================== | ||||
|  | ||||
| if (! requireNamespace("magrittr", quietly = TRUE)) { | ||||
|   install.packages("magrittr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = magrittr)       # basic information | ||||
| #  browseVignettes("magrittr")    # available vignettes | ||||
| #  data(package = "magrittr")     # available datasets | ||||
|  | ||||
|  | ||||
| library(magrittr) | ||||
|  | ||||
| myFiles  %>% nchar %>% order %>% myFiles[.] | ||||
|  | ||||
| # =    4  Base R Pipe  ========================================================= | ||||
|  | ||||
| # Since version 4.1, base R now supports a pipe operator without the need | ||||
| # to load a special package. Such an introduction of external functionality | ||||
| # into the language is very rare. | ||||
| # | ||||
| # Unfortunately it won't (yet) work with the '[' function, so we need to write | ||||
| # an intermediate function for this example | ||||
| extract <- function(x, v) { | ||||
|   return(v[x]) | ||||
| } | ||||
|  | ||||
| myFiles |> nchar() |> order() |> extract(myFiles) | ||||
|  | ||||
|  | ||||
| # =    5  Intermediate Assignment  ============================================= | ||||
|  | ||||
| # So what's the problem? As you can see, the piped code may be concise and | ||||
| # expressive. But there is also a large amount of implicit assignment and | ||||
| # processing going on and that is usually a bad idea because it makes code hard | ||||
| # to maintain. I am NOT a big fan of the nested syntax, but I don't think that | ||||
| # replacing it with the pipe makes things much better. My preferred idiom is | ||||
| # to use intermediate assignments. Only then is it convenient to examine | ||||
| # the code step by step and validate every single step. And that is the most | ||||
| # important objective of all: no code is good if it does not compute | ||||
| # correctly. | ||||
|  | ||||
|  | ||||
| x <- nchar(myFiles) | ||||
| x <- order(x) | ||||
| myFiles[x] | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    6  Postscript  ========================================================== | ||||
|  | ||||
| # I tried to write an example that strips all comments from a list of files, and | ||||
| # another example that finds all files that were not yet updated this year | ||||
| # (according to the "# Date:" in the header). Neither example can be well | ||||
| # written without intermediate assignments, or at least sapply() functions | ||||
| # that are not simpler at all than the intermediate assignment. | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										360
									
								
								RPR-RegEx.R
									
									
									
									
									
								
							
							
						
						
									
										360
									
								
								RPR-RegEx.R
									
									
									
									
									
								
							| @@ -1,180 +1,180 @@ | ||||
| # tocID <- "RPR-RegEx.R" | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-RegEx unit | ||||
| # | ||||
| # Version: 1.0 | ||||
| # | ||||
| # Date:    2017-08  -  2020-09 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # V 0.1    Maintenance 2020 | ||||
| # V 0.1    First code | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # DO NOT SIMPLY  source()  THESE FILES! | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                Line | ||||
| #TOC> ---------------------------------------------------- | ||||
| #TOC>   1        A regex example                        41 | ||||
| #TOC>   2        Counting lines                        108 | ||||
| #TOC>   2.1        Counting C-alpha atoms only         126 | ||||
| #TOC>   3        Code Solutions                        142 | ||||
| #TOC>   3.1        Counting atoms                      144 | ||||
| #TOC>   3.2        Counting C-alpha records            160 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  A regex example  ===================================================== | ||||
|  | ||||
| # The canonical FASTA version of yeast Mbp1 at Uniprot | ||||
| s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1 | ||||
| MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK | ||||
| ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA | ||||
| SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR | ||||
| KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ | ||||
| QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS | ||||
| PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY | ||||
| FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS | ||||
| IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP | ||||
| SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT | ||||
| ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP | ||||
| VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK | ||||
| IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR | ||||
| QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK | ||||
| IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA" | ||||
|  | ||||
| nchar(s) | ||||
| # Must be 969 | ||||
|  | ||||
| # Task: Fetch the Uniprot ID by retrieving the first string that appears between | ||||
| # two vertical bars ("pipes") in the header record. | ||||
| # | ||||
|  | ||||
| # Develop the regular expression: | ||||
|                       # Just five characters returned, so we know we are using | ||||
| patt <- "^>(.{5})"    # the right functions | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
| patt <- "^>(.*)|"    # everything to the pipe character | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
| # Ooops - "|" is a metacharacter - we must escape it | ||||
|  | ||||
| patt <- "^>(.*)\|"    # using "\|" | ||||
| # Ooops - that's not how we escape: must double the \ to send a literal | ||||
| # "\" plus the character "|" to the regex engine. | ||||
|  | ||||
| patt <- "^>(.*)\\|"    # using "\\|" | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
| # Good. Now let's first match everything that is not a "|", then match a "|" | ||||
| patt <- "^>([^|]*)\\|" | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
| # the same thing again, but capture the second match. And insist that there | ||||
| # must be at least one character captured | ||||
|  | ||||
| patt <- "^>[^|]*\\|([^|]+)\\|" | ||||
| # Analyze this pattern: | ||||
| #    ^           anchor the match at the beginning of the line | ||||
| #    >           ">" must be the first character | ||||
| #    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because | ||||
| #                  we don't know what other versions of the string "sp" | ||||
| #                  might appear. Note that within the brackets "|" is NOT a | ||||
| #                  metacharacter. | ||||
| #    \\|         "|" character: outside of square brackets "|" is a metacharacter | ||||
| #                  and means "OR"; we need to escape it to match a literal "|". | ||||
| #    (           open parenthesis: capture what comes next ... | ||||
| #       [^|]+    all-characters-except-a-vertical-bar, 1 or more times | ||||
| #    )           close parenthesis: stop capturing here | ||||
| #    \\|           second "|" character, escaped | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
|  | ||||
| # =    2  Counting lines  ====================================================== | ||||
|  | ||||
| # Task: Write a function that returns the number of atoms in a PDB file. Call it | ||||
| #       atomCount(). Sample data is here: | ||||
| myPDB <- readLines("./data/0TST.pdb") | ||||
|  | ||||
| #       Specification: | ||||
| #       Read a file from its path given as the only argument. | ||||
| #       Return the number of lines in that file that begin with "ATOM  " | ||||
| #       or with "HETATM". | ||||
|  | ||||
| #       Try this. Write a function. Solution code is at the end of this file. | ||||
| #       Don't peek. | ||||
|  | ||||
| atomCount("./data/0TST.pdb")  # must return 6 | ||||
|  | ||||
|  | ||||
|  | ||||
| # ==   2.1  Counting C-alpha atoms only  ======================================= | ||||
|  | ||||
| # Task: write a function based on the previous one that matches only CA records, | ||||
| #       i.e. it can be used to count the number of amino acids. Don't get | ||||
| #       fooled by calcium atoms, or the string CA appearing elsewhere. | ||||
| #       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM | ||||
|  | ||||
| #       Specification: | ||||
| #       Read a file from its path given as the only argument. | ||||
| #       Return the number of lines in that file that have a C-alpha atom. | ||||
|  | ||||
| #       Try this. Solution code is at the end of this file. Don't peek. | ||||
|  | ||||
| CAcount("./data/0TST.pdb")  # must return 1 | ||||
|  | ||||
|  | ||||
| # =    3  Code Solutions  ====================================================== | ||||
|  | ||||
| # ==   3.1  Counting atoms  ==================================================== | ||||
|  | ||||
| atomCount <- function(IN) { | ||||
|   # count the number of atoms in a PDB formatted file | ||||
|   # Parameters: | ||||
|   #     IN  chr  path of the file to read | ||||
|   # Value: | ||||
|   #         numeric  number of lines that match "^ATOM  " or "^HETATM" | ||||
|   # Note: the regex MUST be anchored to the beginning of the line, otherwise | ||||
|   # it might match somewhere in a comment! | ||||
|   x <- readLines(IN) | ||||
|   patt <- "(^ATOM  )|(^HETATM)" | ||||
|   return(length(grep(patt, x))) | ||||
| } | ||||
|  | ||||
|  | ||||
| # ==   3.2  Counting C-alpha records  ========================================== | ||||
|  | ||||
|  | ||||
| CAcount <- function(IN) { | ||||
|   # count the number of C-alpha atoms in a PDB formatted file | ||||
|   # Parameters: | ||||
|   #     IN  chr  path of the file to read | ||||
|   # Value: | ||||
|   #         numeric  number of lines that match " CA " in position 13 - 16 of | ||||
|   #                  an ATOM record. | ||||
|   # Note: the regex MUST be aligned into the right position, otherwise it | ||||
|   #       might match Calcium records! | ||||
|   x <- readLines(IN) | ||||
|   patt <- "^ATOM  ...... CA " | ||||
|   return(length(grep(patt, x))) | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-RegEx.R" | ||||
| # | ||||
| # Purpose: A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-RegEx unit | ||||
| # | ||||
| # Version: 1.0 | ||||
| # | ||||
| # Date:    2017-08  -  2020-09 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # V 0.1    Maintenance 2020 | ||||
| # V 0.1    First code | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||
| # | ||||
| # DO NOT SIMPLY  source()  THESE FILES! | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| #  going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC> | ||||
| #TOC>   Section  Title                                Line | ||||
| #TOC> ---------------------------------------------------- | ||||
| #TOC>   1        A regex example                        41 | ||||
| #TOC>   2        Counting lines                        108 | ||||
| #TOC>   2.1        Counting C-alpha atoms only         126 | ||||
| #TOC>   3        Code Solutions                        142 | ||||
| #TOC>   3.1        Counting atoms                      144 | ||||
| #TOC>   3.2        Counting C-alpha records            160 | ||||
| #TOC> | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  A regex example  ===================================================== | ||||
|  | ||||
| # The canonical FASTA version of yeast Mbp1 at Uniprot | ||||
| s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1 | ||||
| MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK | ||||
| ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA | ||||
| SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR | ||||
| KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ | ||||
| QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS | ||||
| PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY | ||||
| FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS | ||||
| IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP | ||||
| SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT | ||||
| ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP | ||||
| VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK | ||||
| IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR | ||||
| QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK | ||||
| IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA" | ||||
|  | ||||
| nchar(s) | ||||
| # Must be 969 | ||||
|  | ||||
| # Task: Fetch the Uniprot ID by retrieving the first string that appears between | ||||
| # two vertical bars ("pipes") in the header record. | ||||
| # | ||||
|  | ||||
| # Develop the regular expression: | ||||
|                       # Just five characters returned, so we know we are using | ||||
| patt <- "^>(.{5})"    # the right functions | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
| patt <- "^>(.*)|"    # everything to the pipe character | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
| # Ooops - "|" is a metacharacter - we must escape it | ||||
|  | ||||
| patt <- "^>(.*)\|"    # using "\|" | ||||
| # Ooops - that's not how we escape: must double the \ to send a literal | ||||
| # "\" plus the character "|" to the regex engine. | ||||
|  | ||||
| patt <- "^>(.*)\\|"    # using "\\|" | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
| # Good. Now let's first match everything that is not a "|", then match a "|" | ||||
| patt <- "^>([^|]*)\\|" | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
| # the same thing again, but capture the second match. And insist that there | ||||
| # must be at least one character captured | ||||
|  | ||||
| patt <- "^>[^|]*\\|([^|]+)\\|" | ||||
| # Analyze this pattern: | ||||
| #    ^           anchor the match at the beginning of the line | ||||
| #    >           ">" must be the first character | ||||
| #    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because | ||||
| #                  we don't know what other versions of the string "sp" | ||||
| #                  might appear. Note that within the brackets "|" is NOT a | ||||
| #                  metacharacter. | ||||
| #    \\|         "|" character: outside of square brackets "|" is a metacharacter | ||||
| #                  and means "OR"; we need to escape it to match a literal "|". | ||||
| #    (           open parenthesis: capture what comes next ... | ||||
| #       [^|]+    all-characters-except-a-vertical-bar, 1 or more times | ||||
| #    )           close parenthesis: stop capturing here | ||||
| #    \\|           second "|" character, escaped | ||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||
|  | ||||
|  | ||||
| # =    2  Counting lines  ====================================================== | ||||
|  | ||||
| # Task: Write a function that returns the number of atoms in a PDB file. Call it | ||||
| #       atomCount(). Sample data is here: | ||||
| myPDB <- readLines("./data/0TST.pdb") | ||||
|  | ||||
| #       Specification: | ||||
| #       Read a file from its path given as the only argument. | ||||
| #       Return the number of lines in that file that begin with "ATOM  " | ||||
| #       or with "HETATM". | ||||
|  | ||||
| #       Try this. Write a function. Solution code is at the end of this file. | ||||
| #       Don't peek. | ||||
|  | ||||
| atomCount("./data/0TST.pdb")  # must return 6 | ||||
|  | ||||
|  | ||||
|  | ||||
| # ==   2.1  Counting C-alpha atoms only  ======================================= | ||||
|  | ||||
| # Task: write a function based on the previous one that matches only CA records, | ||||
| #       i.e. it can be used to count the number of amino acids. Don't get | ||||
| #       fooled by calcium atoms, or the string CA appearing elsewhere. | ||||
| #       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM | ||||
|  | ||||
| #       Specification: | ||||
| #       Read a file from its path given as the only argument. | ||||
| #       Return the number of lines in that file that have a C-alpha atom. | ||||
|  | ||||
| #       Try this. Solution code is at the end of this file. Don't peek. | ||||
|  | ||||
| CAcount("./data/0TST.pdb")  # must return 1 | ||||
|  | ||||
|  | ||||
| # =    3  Code Solutions  ====================================================== | ||||
|  | ||||
| # ==   3.1  Counting atoms  ==================================================== | ||||
|  | ||||
| atomCount <- function(IN) { | ||||
|   # count the number of atoms in a PDB formatted file | ||||
|   # Parameters: | ||||
|   #     IN  chr  path of the file to read | ||||
|   # Value: | ||||
|   #         numeric  number of lines that match "^ATOM  " or "^HETATM" | ||||
|   # Note: the regex MUST be anchored to the beginning of the line, otherwise | ||||
|   # it might match somewhere in a comment! | ||||
|   x <- readLines(IN) | ||||
|   patt <- "(^ATOM  )|(^HETATM)" | ||||
|   return(length(grep(patt, x))) | ||||
| } | ||||
|  | ||||
|  | ||||
| # ==   3.2  Counting C-alpha records  ========================================== | ||||
|  | ||||
|  | ||||
| CAcount <- function(IN) { | ||||
|   # count the number of C-alpha atoms in a PDB formatted file | ||||
|   # Parameters: | ||||
|   #     IN  chr  path of the file to read | ||||
|   # Value: | ||||
|   #         numeric  number of lines that match " CA " in position 13 - 16 of | ||||
|   #                  an ATOM record. | ||||
|   # Note: the regex MUST be aligned into the right position, otherwise it | ||||
|   #       might match Calcium records! | ||||
|   x <- readLines(IN) | ||||
|   patt <- "^ATOM  ...... CA " | ||||
|   return(length(grep(patt, x))) | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										1658
									
								
								RPR-SX-PDB.R
									
									
									
									
									
								
							
							
						
						
									
										1658
									
								
								RPR-SX-PDB.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,135 +1,135 @@ | ||||
| # tocID <- "RPR-UniProt_GET.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and | ||||
| #                  added FASTA headers as attribute | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0    First ABC units version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                      Line | ||||
| #TOC> ---------------------------------------------------------- | ||||
| #TOC>   1        UniProt files via GET                        43 | ||||
| #TOC>   1.1        Task - fetchUniProtSeq() function         105 | ||||
| #TOC>   2        Task solutions                              118 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  UniProt files via GET  =============================================== | ||||
|  | ||||
|  | ||||
| # Perhaps the simplest example of scripted download is to retrieve a protein | ||||
| # FASTA sequence from UniProt. All we need is to construct an URL with the | ||||
| # correct UniProt ID. | ||||
|  | ||||
| # An interface between R scripts and Web servers is provided by the httr:: | ||||
| # package. This sends and receives information via the http protocol, just like | ||||
| # a Web browser. Since this is a short and simple request, the GET verb is the | ||||
| # right tool: | ||||
|  | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = httr)       # basic information | ||||
| #  browseVignettes("httr")    # available vignettes | ||||
| #  data(package = "httr")     # available datasets | ||||
|  | ||||
|  | ||||
| # The UniProt ID for Mbp1 is ... | ||||
|  | ||||
| UniProtID <- "P39678" | ||||
|  | ||||
| # and the base URL to retrieve data is  ... | ||||
| # http://www.uniprot.org/uniprot/ . We can construct a simple URL to | ||||
| # retrieve a FASTA sequence: | ||||
|  | ||||
| (URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)) | ||||
|  | ||||
| # the GET() function from httr will get the data. | ||||
| response <- httr::GET(URL) | ||||
|  | ||||
| str(response) # the response object is a bit complex ... | ||||
| as.character(response) # ... but it is easy to pull out the data. | ||||
|  | ||||
| # to process  ... | ||||
| x <- as.character(response) | ||||
| x <- strsplit(x, "\n") | ||||
| dbSanitizeSequence(x) | ||||
|  | ||||
| # Simple. | ||||
| # But what happens if there is an error, e.g. the uniprot ID does not exist? | ||||
|  | ||||
| response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta") | ||||
| as.character(response) | ||||
| # this is a large HTML page that tells us the URL was not found. So we need to | ||||
| # check for errors.  The Right Way to do this is to evaluate the status code that | ||||
| # every Web server returns for every transaction. | ||||
| # | ||||
| httr::status_code(response)  # 404 == Page Not Found | ||||
|  | ||||
| # There are many possible codes, but the only code we will be happy with | ||||
| # is 200 - oK. | ||||
| # (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes ) | ||||
|  | ||||
| URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID) | ||||
| response <- httr::GET(URL) | ||||
| httr::status_code(response) | ||||
|  | ||||
|  | ||||
| # ==   1.1  Task - fetchUniProtSeq() function  ================================= | ||||
|  | ||||
| # Task: write a function that | ||||
| #   - takes as input a vector of UniProt IDs, | ||||
| #   - fetches the FASTA sequence for each | ||||
| #   - returns a vector of the same length as the input, where an element is: | ||||
| #   -  ...  the sequence, if the query was successful | ||||
| #   -  ...  NA if there was an error | ||||
| #   - each element has the UniProt ID as the name() | ||||
| #   - bonus: the output has an attribute "headers" that is a vector of the | ||||
| #            FASTA headers ( cf. ?attr ) | ||||
|  | ||||
|  | ||||
| # =    2  Task solutions  ====================================================== | ||||
|  | ||||
|  | ||||
| # I have placed such a function - dbFetchUniProtSeq() - into | ||||
| # "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq() | ||||
| # in the Environment pane. | ||||
|  | ||||
| # Test this: | ||||
| ( x <- dbFetchUniProtSeq("P39678") ) | ||||
| names(x)[1] | ||||
| attr(x, "headers")[1] | ||||
| x[1] | ||||
| cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]), | ||||
|                width = 40), sep = "\n") | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-UniProt_GET.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and | ||||
| #                  added FASTA headers as attribute | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0    First ABC units version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                      Line | ||||
| #TOC> ---------------------------------------------------------- | ||||
| #TOC>   1        UniProt files via GET                        43 | ||||
| #TOC>   1.1        Task - fetchUniProtSeq() function         105 | ||||
| #TOC>   2        Task solutions                              118 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  UniProt files via GET  =============================================== | ||||
|  | ||||
|  | ||||
| # Perhaps the simplest example of scripted download is to retrieve a protein | ||||
| # FASTA sequence from UniProt. All we need is to construct an URL with the | ||||
| # correct UniProt ID. | ||||
|  | ||||
| # An interface between R scripts and Web servers is provided by the httr:: | ||||
| # package. This sends and receives information via the http protocol, just like | ||||
| # a Web browser. Since this is a short and simple request, the GET verb is the | ||||
| # right tool: | ||||
|  | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = httr)       # basic information | ||||
| #  browseVignettes("httr")    # available vignettes | ||||
| #  data(package = "httr")     # available datasets | ||||
|  | ||||
|  | ||||
| # The UniProt ID for Mbp1 is ... | ||||
|  | ||||
| UniProtID <- "P39678" | ||||
|  | ||||
| # and the base URL to retrieve data is  ... | ||||
| # http://www.uniprot.org/uniprot/ . We can construct a simple URL to | ||||
| # retrieve a FASTA sequence: | ||||
|  | ||||
| (URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)) | ||||
|  | ||||
| # the GET() function from httr will get the data. | ||||
| response <- httr::GET(URL) | ||||
|  | ||||
| str(response) # the response object is a bit complex ... | ||||
| as.character(response) # ... but it is easy to pull out the data. | ||||
|  | ||||
| # to process  ... | ||||
| x <- as.character(response) | ||||
| x <- strsplit(x, "\n") | ||||
| dbSanitizeSequence(x) | ||||
|  | ||||
| # Simple. | ||||
| # But what happens if there is an error, e.g. the uniprot ID does not exist? | ||||
|  | ||||
| response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta") | ||||
| as.character(response) | ||||
| # this is a large HTML page that tells us the URL was not found. So we need to | ||||
| # check for errors.  The Right Way to do this is to evaluate the status code that | ||||
| # every Web server returns for every transaction. | ||||
| # | ||||
| httr::status_code(response)  # 404 == Page Not Found | ||||
|  | ||||
| # There are many possible codes, but the only code we will be happy with | ||||
| # is 200 - oK. | ||||
| # (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes ) | ||||
|  | ||||
| URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID) | ||||
| response <- httr::GET(URL) | ||||
| httr::status_code(response) | ||||
|  | ||||
|  | ||||
| # ==   1.1  Task - fetchUniProtSeq() function  ================================= | ||||
|  | ||||
| # Task: write a function that | ||||
| #   - takes as input a vector of UniProt IDs, | ||||
| #   - fetches the FASTA sequence for each | ||||
| #   - returns a vector of the same length as the input, where an element is: | ||||
| #   -  ...  the sequence, if the query was successful | ||||
| #   -  ...  NA if there was an error | ||||
| #   - each element has the UniProt ID as the name() | ||||
| #   - bonus: the output has an attribute "headers" that is a vector of the | ||||
| #            FASTA headers ( cf. ?attr ) | ||||
|  | ||||
|  | ||||
| # =    2  Task solutions  ====================================================== | ||||
|  | ||||
|  | ||||
| # I have placed such a function - dbFetchUniProtSeq() - into | ||||
| # "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq() | ||||
| # in the Environment pane. | ||||
|  | ||||
| # Test this: | ||||
| ( x <- dbFetchUniProtSeq("P39678") ) | ||||
| names(x)[1] | ||||
| attr(x, "headers")[1] | ||||
| x[1] | ||||
| cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]), | ||||
|                width = 40), sep = "\n") | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,234 +1,234 @@ | ||||
| # tocID <- "RPR-Unit_testing.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Unit_testing unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017  10  -  2019  01 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Updates. Discuss local tests. | ||||
| #           1.1    Change from require() to requireNamespace() | ||||
| #           1.0    New code | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                             Line | ||||
| #TOC> ------------------------------------------------- | ||||
| #TOC>   1        Unit Tests with testthat            42 | ||||
| #TOC>   2        Organizing your tests              165 | ||||
| #TOC>   2.1        Testing scripts                  189 | ||||
| #TOC>   2.2        Rethinking testing               202 | ||||
| #TOC>   3        Task solutions                     220 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Unit Tests with testthat  ============================================ | ||||
|  | ||||
| # The testthat package supports writing and executing unit tests in many ways. | ||||
|  | ||||
| if (! requireNamespace("testthat", quietly = TRUE)) { | ||||
|   install.packages("testthat") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = testthat)       # basic information | ||||
| #  browseVignettes("testthat")    # available vignettes | ||||
| #  data(package = "testthat")     # available datasets | ||||
|  | ||||
| # testthat is one of those packages that we either use A LOT in a script, | ||||
| # or not at all. Therefore it's more reasonable to depart from our usual | ||||
| # <package>::<function>() idiom, and load the entire library. In fact, if | ||||
| # we author packages, it is common practice to load testthat in the part | ||||
| # of the package that automates testing. | ||||
|  | ||||
| library(testthat) | ||||
|  | ||||
| # An atomic test consists of an expectation about the behaviour of a function or | ||||
| # the existence of an object. testthat provides a number of useful expectations: | ||||
|  | ||||
| # At the most basic level, you can use expect_true() and expect_false(): | ||||
|  | ||||
| expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa")) | ||||
| expect_true(file.exists("NO-SUCH-FILE.txt")) | ||||
|  | ||||
| expect_false(is.integer(NA)) | ||||
|  | ||||
| # More commonly, you will test for equality of an output with a given result. | ||||
| # But you need to consider what it means for two numbers to be "equal" on a | ||||
| # digital computer. Consider: | ||||
|  | ||||
| 49*(1/49) == 1      # Surprised? Read FAQ 7.31 | ||||
|                     # https://cran.r-project.org/doc/FAQ/R-FAQ.html | ||||
| 49*(1/49) - 1       # NOT zero (but almost) | ||||
|  | ||||
| # This is really unpredictable ... | ||||
| 0.1 + 0.05 == 0.15 | ||||
| 0.2 + 0.07 == 0.27 | ||||
|  | ||||
| # It's easy to be caught on the wrong foot with numeric comparisons, therefore | ||||
| # R uses the function all.equal() to test whether two numbers are equal for | ||||
| # practical purposes up to machine precision. | ||||
| 49*(1/49) == 1 | ||||
| all.equal(49*(1/49), 1) | ||||
|  | ||||
| # The testthat function expect_equal() uses all.equal internally: | ||||
| expect_equal(49*(1/49), 1) | ||||
|  | ||||
| # ... which is reasonable, or, if things MUST be exactly the same ... | ||||
| expect_identical(49*(1/49), 1) | ||||
|  | ||||
| # ... but consider: | ||||
| expect_identical(2, 2L) # one is typeof() "double", the other is "integer" | ||||
|  | ||||
| # Some very useful expectations are expect_warning(), and expect_error(), for | ||||
| # constructing tests that check for erroneous output: | ||||
|  | ||||
| as.integer(c("1", "2", "three")) | ||||
| expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT | ||||
|                                                  # printed. | ||||
| 1/"x" | ||||
| expect_warning(1/"x") | ||||
| expect_error(1/"x")      # Again: note that the error is NOT printed, and | ||||
|                          # code execution will continue. | ||||
|  | ||||
| # Even better, you can check if the warning or error is what you expect it | ||||
| # to be - because it could actually have occurred somewhere else in your code. | ||||
|  | ||||
| v <- c("1", "x") | ||||
| log(v[1:2]) | ||||
| expect_error(log(v[1:2]), "non-numeric argument to mathematical function") | ||||
| expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message. | ||||
| expect_error(log(v[1,2]))                # This appears oK, but ... | ||||
| expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error! | ||||
|  | ||||
| # Producing unit tests simply means: we define a function, and then we check | ||||
| # whether all tests pass. Consider a function that is loaded on startup from | ||||
| # the .utilities.R script: | ||||
|  | ||||
| biCode | ||||
|  | ||||
| # We could test it like so: | ||||
|  | ||||
| expect_equal(biCode(""), ".....") | ||||
| expect_equal(biCode(" "), ".....") | ||||
| expect_equal(biCode("123 12"), ".....") | ||||
| expect_equal(biCode("h sapiens"), "H..SA") | ||||
| expect_equal(biCode("homo sapiens"), "HOMSA") | ||||
| expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA") | ||||
| expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")), | ||||
|              c("PHACI", "MACRU")) | ||||
| expect_error(biCode(), "argument \"s\" is missing, with no default") | ||||
|  | ||||
| # The test_that() function allows to group related tests, include an informative | ||||
| # message which test is being executed, and run a number of tests that are | ||||
| # passed to the function inside a code block - i.e. {...} | ||||
| # test_that("<descriptive string>", {<code block>}) | ||||
|  | ||||
| test_that("NA values are preserved", { | ||||
|   # biCode() respects vector length: input and output must have the same length. | ||||
|   # Therefore NA's can't be simply skipped, but must be properly passed | ||||
|   # into output: | ||||
|   expect_true(is.na((biCode(NA)))) | ||||
|   expect_equal(biCode(c("first", NA, "last")), | ||||
|                c("FIRST", NA, "LAST.")) | ||||
| }) | ||||
|  | ||||
|  | ||||
| # Task: Write a function calcGC() that calculates GC content in a sequence. | ||||
| #       Hint: you could strsplit() the sequence into a vector, and count | ||||
| #       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove | ||||
| #       A's and T's, and use nchar() before and after to calculate the content | ||||
| #       from the length difference. | ||||
| #       Then write tests that: | ||||
| #          confirm that calcGC("AATT") is 0; | ||||
| #          confirm that calcGC("ATGC") is 0.5; | ||||
| #          confirm that calcGC("AC")   is 0.5; | ||||
| #          confirm that calcGC("CGCG") is 1; | ||||
|  | ||||
|  | ||||
| # =    2  Organizing your tests  =============================================== | ||||
|  | ||||
|  | ||||
| # Tests are only useful if they are actually executed and we need to make sure | ||||
| # there are no barriers to do that. The testthat package supports automatic | ||||
| # execution of tests: | ||||
| #  - put your tests into an R-script, | ||||
| #  - save your tests in a file called "test_<my-function-name>.R" | ||||
| #  - execute the test with test_file("test_<my-function-name>.R") ... | ||||
| #  ... or, if you are working on a project ... | ||||
| #  - place the file in a test-directory (e.g. the directory "tests" in this | ||||
| #      project), | ||||
| #  - execute all your tests with test_dir("<my-test-directory>") | ||||
|  | ||||
| # For example I have provided a "tests" directory with this project, and | ||||
| # placed the file "test_biCode.R" inside. | ||||
| file.show("./tests/test_biCode.R") | ||||
|  | ||||
| # Execute the file ... | ||||
| test_file("./tests/test_biCode.R") | ||||
|  | ||||
| # .. or execute all the test files in the directory: | ||||
| test_dir("./tests") | ||||
|  | ||||
| # ==   2.1  Testing scripts  =================================================== | ||||
|  | ||||
| # Scripts need special consideration since we do not necessarily source() them | ||||
| # entirely. Therefore automated testing is not reasonable. What you can do | ||||
| # instead is to place a conditional block at the end of your script, that | ||||
| # never gets executed - then you can manually execute the code in the block | ||||
| # whenever you wish to test your functions. For example: | ||||
|  | ||||
| if (FALSE) { | ||||
|   # ... your tests go here | ||||
|  | ||||
| } | ||||
|  | ||||
| # ==   2.2  Rethinking testing  ================================================ | ||||
|  | ||||
| # However, it is important to keep in mind that different objectives lead to | ||||
| # different ideas of what works best. There is never a "best" in and of itself, | ||||
| # the question is always: "Best for what?" While automated unit testing is a | ||||
| # great way to assure the integrity of packages and larger software artefacts as | ||||
| # they are being developed, more loosely conceived aggregates of code - like the | ||||
| # scripts for this course for example - have different objectives and in this | ||||
| # case I find the testthat approach to actually be inferior. The reason is its | ||||
| # tendency to physically separate code and tests. Keeping assets, and functions | ||||
| # that operate on those assets separated is always poor design. I have found | ||||
| # over time that a more stable approach is to move individual functions into | ||||
| # their individual scripts, all in one folder, one function (and its helpers) | ||||
| # per file, and examples, demos and tests in an if (FALSE) { ... } block, as | ||||
| # explained above. | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    3  Task solutions  ====================================================== | ||||
|  | ||||
| calcGC <- function(s) { | ||||
|   s <- gsub("[^agctAGCT]", "", s) | ||||
|   return(nchar(gsub("[atAT]", "", s)) / nchar(s)) | ||||
| } | ||||
|  | ||||
| expect_equal(calcGC("AATT"), 0) | ||||
| expect_equal(calcGC("ATGC"), 0.5) | ||||
| expect_equal(calcGC("AC"),   0.5) | ||||
| expect_equal(calcGC("CGCG"), 1) | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-Unit_testing.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Unit_testing unit. | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017  10  -  2019  01 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Updates. Discuss local tests. | ||||
| #           1.1    Change from require() to requireNamespace() | ||||
| #           1.0    New code | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                             Line | ||||
| #TOC> ------------------------------------------------- | ||||
| #TOC>   1        Unit Tests with testthat            42 | ||||
| #TOC>   2        Organizing your tests              165 | ||||
| #TOC>   2.1        Testing scripts                  189 | ||||
| #TOC>   2.2        Rethinking testing               202 | ||||
| #TOC>   3        Task solutions                     220 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Unit Tests with testthat  ============================================ | ||||
|  | ||||
| # The testthat package supports writing and executing unit tests in many ways. | ||||
|  | ||||
| if (! requireNamespace("testthat", quietly = TRUE)) { | ||||
|   install.packages("testthat") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = testthat)       # basic information | ||||
| #  browseVignettes("testthat")    # available vignettes | ||||
| #  data(package = "testthat")     # available datasets | ||||
|  | ||||
| # testthat is one of those packages that we either use A LOT in a script, | ||||
| # or not at all. Therefore it's more reasonable to depart from our usual | ||||
| # <package>::<function>() idiom, and load the entire library. In fact, if | ||||
| # we author packages, it is common practice to load testthat in the part | ||||
| # of the package that automates testing. | ||||
|  | ||||
| library(testthat) | ||||
|  | ||||
| # An atomic test consists of an expectation about the behaviour of a function or | ||||
| # the existence of an object. testthat provides a number of useful expectations: | ||||
|  | ||||
| # At the most basic level, you can use expect_true() and expect_false(): | ||||
|  | ||||
| expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa")) | ||||
| expect_true(file.exists("NO-SUCH-FILE.txt")) | ||||
|  | ||||
| expect_false(is.integer(NA)) | ||||
|  | ||||
| # More commonly, you will test for equality of an output with a given result. | ||||
| # But you need to consider what it means for two numbers to be "equal" on a | ||||
| # digital computer. Consider: | ||||
|  | ||||
| 49*(1/49) == 1      # Surprised? Read FAQ 7.31 | ||||
|                     # https://cran.r-project.org/doc/FAQ/R-FAQ.html | ||||
| 49*(1/49) - 1       # NOT zero (but almost) | ||||
|  | ||||
| # This is really unpredictable ... | ||||
| 0.1 + 0.05 == 0.15 | ||||
| 0.2 + 0.07 == 0.27 | ||||
|  | ||||
| # It's easy to be caught on the wrong foot with numeric comparisons, therefore | ||||
| # R uses the function all.equal() to test whether two numbers are equal for | ||||
| # practical purposes up to machine precision. | ||||
| 49*(1/49) == 1 | ||||
| all.equal(49*(1/49), 1) | ||||
|  | ||||
| # The testthat function expect_equal() uses all.equal internally: | ||||
| expect_equal(49*(1/49), 1) | ||||
|  | ||||
| # ... which is reasonable, or, if things MUST be exactly the same ... | ||||
| expect_identical(49*(1/49), 1) | ||||
|  | ||||
| # ... but consider: | ||||
| expect_identical(2, 2L) # one is typeof() "double", the other is "integer" | ||||
|  | ||||
| # Some very useful expectations are expect_warning(), and expect_error(), for | ||||
| # constructing tests that check for erroneous output: | ||||
|  | ||||
| as.integer(c("1", "2", "three")) | ||||
| expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT | ||||
|                                                  # printed. | ||||
| 1/"x" | ||||
| expect_warning(1/"x") | ||||
| expect_error(1/"x")      # Again: note that the error is NOT printed, as well | ||||
|                          # code execution will continue. | ||||
|  | ||||
| # Even better, you can check if the warning or error is what you expect it | ||||
| # to be - because it could actually have occurred somewhere else in your code. | ||||
|  | ||||
| v <- c("1", "x") | ||||
| log(v[1:2]) | ||||
| expect_error(log(v[1:2]), "non-numeric argument to mathematical function") | ||||
| expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message. | ||||
| expect_error(log(v[1,2]))                # This appears oK, but ... | ||||
| expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error! | ||||
|  | ||||
| # Producing unit tests simply means: we define a function, and then we check | ||||
| # whether all tests pass. Consider a function that is loaded on startup from | ||||
| # the .utilities.R script: | ||||
|  | ||||
| biCode | ||||
|  | ||||
| # We could test it like so: | ||||
|  | ||||
| expect_equal(biCode(""), ".....") | ||||
| expect_equal(biCode(" "), ".....") | ||||
| expect_equal(biCode("123 12"), ".....") | ||||
| expect_equal(biCode("h sapiens"), "H..SA") | ||||
| expect_equal(biCode("homo sapiens"), "HOMSA") | ||||
| expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA") | ||||
| expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")), | ||||
|              c("PHACI", "MACRU")) | ||||
| expect_error(biCode(), "argument \"s\" is missing, with no default") | ||||
|  | ||||
| # The test_that() function allows us to group related tests, include an | ||||
| # informative message stating which test is being executed, and run a number | ||||
| # of tests that are passed to the function inside a code block - i.e. {...} | ||||
| # test_that("<descriptive string>", {<code block>}) | ||||
|  | ||||
| test_that("NA values are preserved", { | ||||
|   # biCode() respects vector length: input and output must have the same length. | ||||
|   # Therefore NA's can't be simply skipped, but must be properly passed | ||||
|   # into output: | ||||
|   expect_true(is.na((biCode(NA)))) | ||||
|   expect_equal(biCode(c("first", NA, "last")), | ||||
|                c("FIRST", NA, "LAST.")) | ||||
| }) | ||||
|  | ||||
|  | ||||
| # Task: Write a function calcGC() that calculates GC content in a sequence. | ||||
| #       Hint: you could strsplit() the sequence into a vector, and count | ||||
| #       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove | ||||
| #       A's and T's, and use nchar() before and after to calculate the content | ||||
| #       from the length difference. | ||||
| #       Then write tests that: | ||||
| #          confirm that calcGC("AATT") is 0; | ||||
| #          confirm that calcGC("ATGC") is 0.5; | ||||
| #          confirm that calcGC("AC")   is 0.5; | ||||
| #          confirm that calcGC("CGCG") is 1; | ||||
|  | ||||
|  | ||||
| # =    2  Organizing your tests  =============================================== | ||||
|  | ||||
|  | ||||
| # Tests are only useful if they are actually executed and we need to make sure | ||||
| # there are no barriers to do that. The testthat package supports automatic | ||||
| # execution of tests: | ||||
| #  - put your tests into an R-script, | ||||
| #  - save your tests in a file called "test_<my-function-name>.R" | ||||
| #  - execute the test with test_file("test_<my-function-name>.R") ... | ||||
| #  ... or, if you are working on a project ... | ||||
| #  - place the file in a test-directory (e.g. the directory "tests" in this | ||||
| #      project), | ||||
| #  - execute all your tests with test_dir("<my-test-directory>") | ||||
|  | ||||
| # For example I have provided a "tests" directory with this project, and | ||||
| # placed the file "test_biCode.R" inside. | ||||
| file.show("./tests/test_biCode.R") | ||||
|  | ||||
| # Execute the file ... | ||||
| test_file("./tests/test_biCode.R") | ||||
|  | ||||
| # .. or execute all the test files in the directory: | ||||
| test_dir("./tests") | ||||
|  | ||||
| # ==   2.1  Testing scripts  =================================================== | ||||
|  | ||||
| # Scripts need special consideration since we do not necessarily source() them | ||||
| # entirely. Therefore automated testing is not reasonable. What you can do | ||||
| # instead is to place a conditional block at the end of your script, that | ||||
| # never gets executed - then you can manually execute the code in the block | ||||
| # whenever you wish to test your functions. For example: | ||||
|  | ||||
| if (FALSE) { | ||||
|   # ... your tests go here | ||||
|  | ||||
| } | ||||
|  | ||||
| # ==   2.2  Rethinking testing  ================================================ | ||||
|  | ||||
| # However, it is important to keep in mind that different objectives lead to | ||||
| # different ideas of what works best. There is never a "best" in and of itself, | ||||
| # the question is always: "Best for what?" While automated unit testing is a | ||||
| # great way to assure the integrity of packages and larger software artefacts as | ||||
| # they are being developed, more loosely conceived aggregates of code - like the | ||||
| # scripts for this course for example - have different objectives and in this | ||||
| # case I find the testthat approach to actually be inferior. The reason is its | ||||
| # tendency to physically separate code and tests. Keeping assets, and functions | ||||
| # that operate on those assets separated is always poor design. I have found | ||||
| # over time that a more stable approach is to move individual functions into | ||||
| # their individual scripts, all in one folder, one function (and its helpers) | ||||
| # per file, and examples, demos and tests in an if (FALSE) { ... } block, as | ||||
| # explained above. | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    3  Task solutions  ====================================================== | ||||
|  | ||||
| # calcGC() returns the fraction of G and C characters among the A, C, G and T | ||||
| # characters of s (upper- or lowercase). All other characters are removed | ||||
| # first and do not count towards the sequence length. | ||||
| # NOTE(review): a string without any A, C, G or T gives 0 / 0, i.e. NaN. | ||||
| calcGC <- function(s) { | ||||
|   s <- gsub("[^agctAGCT]", "", s) | ||||
|   return(nchar(gsub("[atAT]", "", s)) / nchar(s)) | ||||
| } | ||||
|  | ||||
| expect_equal(calcGC("AATT"), 0) | ||||
| expect_equal(calcGC("ATGC"), 0.5) | ||||
| expect_equal(calcGC("AC"),   0.5) | ||||
| expect_equal(calcGC("CGCG"), 1) | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										332
									
								
								RPR-eUtils_XML.R
									
									
									
									
									
								
							
							
						
						
									
										332
									
								
								RPR-eUtils_XML.R
									
									
									
									
									
								
							| @@ -1,166 +1,166 @@ | ||||
| # tocID <- "RPR-eUtils_XML.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||
| # | ||||
| # Version:  1.2.1 | ||||
| # | ||||
| # Date:     2017-10  -  2021-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2.1  2021 Maintenance | ||||
| #           1.2    2020 Updates | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0    First ABC units version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                       Line | ||||
| #TOC> ----------------------------------------------------------- | ||||
| #TOC>   1        Working with NCBI eUtils                      43 | ||||
| #TOC>   1.1        Task - fetchNCBItaxData() function         145 | ||||
| #TOC>   2        Task solutions                               152 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Working with NCBI eUtils  ============================================ | ||||
|  | ||||
|  | ||||
| # To begin, we load the xml2 package that contains functions | ||||
| # we need to receive and parse html data. NCBI's eUtils send information in | ||||
| # XML format so we need to be able to parse XML. | ||||
| if (! requireNamespace("xml2", quietly=TRUE)) { | ||||
|   install.packages("xml2") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = xml2)       # basic information | ||||
| #  browseVignettes("xml2")    # available vignettes | ||||
| #  data(package = "xml2")     # available datasets | ||||
|  | ||||
|  | ||||
|  | ||||
| # We will walk through the process with the refSeqID | ||||
| # of yeast Mbp1 | ||||
| refSeqID <- "NP_010227" | ||||
|  | ||||
|  | ||||
| # First we build a query URL... | ||||
| eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" | ||||
|  | ||||
|  | ||||
| # Then we assemble an URL that will search for and get the | ||||
| # unique, NCBI internal identifier, | ||||
| # for our refSeqID... | ||||
| URL <- paste(eUtilsBase, | ||||
|              "esearch.fcgi?",     # ...using the esearch program | ||||
|                                   # that finds an entry in an | ||||
|                                   # NCBI database | ||||
|              "db=protein", | ||||
|              "&term=", refSeqID, | ||||
|              sep="") | ||||
| # Copy the URL and paste it into your browser to see | ||||
| # what the response should look like. | ||||
| URL | ||||
|  | ||||
| # To fetch a response in R, we use the function read_xml() | ||||
| # with our URL as its argument. | ||||
| ( myXML <- xml2::read_xml(URL) ) | ||||
|  | ||||
| # This is XML. We can take the response apart into | ||||
| # its individual components with the as_list() function. | ||||
|  | ||||
| xml2::as_list(myXML) | ||||
|  | ||||
| # Note how the XML "tree" is represented as a list of | ||||
| # lists of lists ... | ||||
| # If we know exactly what element we are looking for, | ||||
| # we can extract it from this structure: | ||||
| xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]] | ||||
|  | ||||
| # But this is not very robust, it would break with the | ||||
| # slightest change that the NCBI makes to their data format - | ||||
| # and the NCBI changes things A LOT! | ||||
|  | ||||
| # Somewhat more robust is to specify the type of element | ||||
| # we want - it's the text contained in an <Id>...</Id> | ||||
| # element, and use the XPath XML parsing language to | ||||
| # retrieve it. | ||||
|  | ||||
| xml2::xml_find_all(myXML, "//Id") # returns a "node set" | ||||
|  | ||||
| xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents | ||||
|                                                   # of the node set | ||||
|  | ||||
| # We will need to do this more than once, so we write a function | ||||
| # for it... | ||||
| node2text <- function(doc, tag) { | ||||
|   # Extract the text contents of all <tag> elements in the XML document | ||||
|   # doc. The elements are located with the XPath expression "//<tag>". | ||||
|   # Returns the result of xml2::xml_text() on the matching node set: | ||||
|   # a vector of strings, one per matching element. | ||||
|   path <- paste0("//", tag) | ||||
|   nodes <- xml2::xml_find_all(doc, path) | ||||
|   return(xml2::xml_text(nodes)) | ||||
| } | ||||
|  | ||||
| # using node2text() ... | ||||
| (GID <- node2text(myXML, "Id")) | ||||
|  | ||||
| # The GI is the pivot for data requests at the | ||||
| # NCBI. | ||||
|  | ||||
| # Let's first get the associated data for this GI | ||||
| URL <- paste0(eUtilsBase, | ||||
|               "esummary.fcgi?", | ||||
|               "db=protein", | ||||
|               "&id=", | ||||
|               GID, | ||||
|               "&version=2.0") | ||||
| (myXML <- xml2::read_xml(URL)) | ||||
|  | ||||
| (taxID <- node2text(myXML, "TaxId")) | ||||
| (organism <- node2text(myXML, "Organism")) | ||||
|  | ||||
| #  This forms the base of a function that gets taxonomy data | ||||
| #  from an Entrez result. You can write this! | ||||
|  | ||||
|  | ||||
| # ==   1.1  Task - fetchNCBItaxData() function  ================================ | ||||
|  | ||||
| # Task: write a function that takes as input a RefSeq ID, fetches the taxonomy | ||||
| # information, returns a list with taxID and organism, if the operation is | ||||
| # successful, or a list of length 0 if there is an error. | ||||
|  | ||||
|  | ||||
| # =    2  Task solutions  ====================================================== | ||||
|  | ||||
| # I have placed such a function into the dbUtilities script: look it up by | ||||
| # clicking on  dbFetchNCBItaxData() in the Environment pane. | ||||
|  | ||||
| # Test: | ||||
| dbFetchNCBItaxData("XP_001837394") | ||||
|  | ||||
| # Expected output: | ||||
| # ---------------- | ||||
| # taxID                         organism | ||||
| # 1 240176 Coprinopsis cinerea okayama7#130 | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "RPR-eUtils_XML.R" | ||||
| # | ||||
| # Purpose:  A Bioinformatics Course: | ||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||
| # | ||||
| # Version:  1.2.1 | ||||
| # | ||||
| # Date:     2017-10  -  2021-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2.1  2021 Maintenance | ||||
| #           1.2    2020 Updates | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0    First ABC units version | ||||
| #           0.1    First code copied from 2016 material. | ||||
| # | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # | ||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||
| # | ||||
| # If there are portions you don't understand, use R's help system, Google for an | ||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | ||||
| # going on. That's not how it works ... | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                       Line | ||||
| #TOC> ----------------------------------------------------------- | ||||
| #TOC>   1        Working with NCBI eUtils                      43 | ||||
| #TOC>   1.1        Task - fetchNCBItaxData() function         145 | ||||
| #TOC>   2        Task solutions                               152 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Working with NCBI eUtils  ============================================ | ||||
|  | ||||
|  | ||||
| # To begin, we load the xml2 package that contains functions | ||||
| # we need to receive and parse html data. NCBI's eUtils send information in | ||||
| # XML format so we need to be able to parse XML. | ||||
| if (! requireNamespace("xml2", quietly=TRUE)) { | ||||
|   install.packages("xml2") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = xml2)       # basic information | ||||
| #  browseVignettes("xml2")    # available vignettes | ||||
| #  data(package = "xml2")     # available datasets | ||||
|  | ||||
|  | ||||
|  | ||||
| # We will walk through the process with the refSeqID | ||||
| # of yeast Mbp1 | ||||
| refSeqID <- "NP_010227" | ||||
|  | ||||
|  | ||||
| # First we build a query URL... | ||||
| eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" | ||||
|  | ||||
|  | ||||
| # Then we assemble an URL that will search for and get the | ||||
| # unique, NCBI internal identifier, | ||||
| # for our refSeqID... | ||||
| URL <- paste(eUtilsBase, | ||||
|              "esearch.fcgi?",     # ...using the esearch program | ||||
|                                   # that finds an entry in an | ||||
|                                   # NCBI database | ||||
|              "db=protein", | ||||
|              "&term=", refSeqID, | ||||
|              sep="") | ||||
| # Copy the URL and paste it into your browser to see | ||||
| # what the response should look like. | ||||
| URL | ||||
|  | ||||
| # To fetch a response in R, we use the function read_xml() | ||||
| # with our URL as its argument. | ||||
| ( myXML <- xml2::read_xml(URL) ) | ||||
|  | ||||
| # This is XML. We can take the response apart into | ||||
| # its individual components with the as_list() function. | ||||
|  | ||||
| xml2::as_list(myXML) | ||||
|  | ||||
| # Note how the XML "tree" is represented as a list of | ||||
| # lists of lists ... | ||||
| # If we know exactly what element we are looking for, | ||||
| # we can extract it from this structure: | ||||
| xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]] | ||||
|  | ||||
| # But this is not very robust, it would break with the | ||||
| # slightest change that the NCBI makes to their data format - | ||||
| # and the NCBI changes things A LOT! | ||||
|  | ||||
| # Somewhat more robust is to specify the type of element | ||||
| # we want - it's the text contained in an <Id>...</Id> | ||||
| # element, and use the XPath XML parsing language to | ||||
| # retrieve it. | ||||
|  | ||||
| xml2::xml_find_all(myXML, "//Id") # returns a "node set" | ||||
|  | ||||
| xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents | ||||
|                                                   # of the node set | ||||
|  | ||||
| # We will need to do this more than once, so we write a function | ||||
| # for it... | ||||
| node2text <- function(doc, tag) { | ||||
|   # Extract the text contents of all <tag> elements in the XML document | ||||
|   # doc. The elements are located with the XPath expression "//<tag>". | ||||
|   # Returns the result of xml2::xml_text() on the matching node set: | ||||
|   # a vector of strings, one per matching element. | ||||
|   path <- paste0("//", tag) | ||||
|   nodes <- xml2::xml_find_all(doc, path) | ||||
|   return(xml2::xml_text(nodes)) | ||||
| } | ||||
|  | ||||
| # using node2text() ... | ||||
| (GID <- node2text(myXML, "Id")) | ||||
|  | ||||
| # The GI is the pivot for data requests at the | ||||
| # NCBI. | ||||
|  | ||||
| # Let's first get the associated data for this GI | ||||
| URL <- paste0(eUtilsBase, | ||||
|               "esummary.fcgi?", | ||||
|               "db=protein", | ||||
|               "&id=", | ||||
|               GID, | ||||
|               "&version=2.0") | ||||
| (myXML <- xml2::read_xml(URL)) | ||||
|  | ||||
| (taxID <- node2text(myXML, "TaxId")) | ||||
| (organism <- node2text(myXML, "Organism")) | ||||
|  | ||||
| #  This forms the base of a function that gets taxonomy data | ||||
| #  from an Entrez result. You can write this! | ||||
|  | ||||
|  | ||||
| # ==   1.1  Task - fetchNCBItaxData() function  ================================ | ||||
|  | ||||
| # Task: write a function that takes as input a RefSeq ID, fetches the taxonomy | ||||
| # information, returns a list with taxID and organism, if the operation is | ||||
| # successful, or a list of length 0 if there is an error. | ||||
|  | ||||
|  | ||||
| # =    2  Task solutions  ====================================================== | ||||
|  | ||||
| # I have placed such a function into the dbUtilities script: look it up by | ||||
| # clicking on  dbFetchNCBItaxData() in the Environment pane. | ||||
|  | ||||
| # Test: | ||||
| dbFetchNCBItaxData("XP_001837394") | ||||
|  | ||||
| # Expected output: | ||||
| # ---------------- | ||||
| # taxID                         organism | ||||
| # 1 240176 Coprinopsis cinerea okayama7#130 | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,10 +1,10 @@ | ||||
| HEADER   TEST                                                 0TST      0TST   1 | ||||
| REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2 | ||||
| ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3 | ||||
| ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4 | ||||
| ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5 | ||||
| ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6 | ||||
| TER       5      GLY     1                                              0TST   7 | ||||
| HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8 | ||||
| HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9 | ||||
| END                                                                     0TST  10 | ||||
| HEADER   TEST                                                 0TST      0TST   1 | ||||
| REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2 | ||||
| ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3 | ||||
| ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4 | ||||
| ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5 | ||||
| ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6 | ||||
| TER       5      GLY     1                                              0TST   7 | ||||
| HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8 | ||||
| HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9 | ||||
| END                                                                     0TST  10 | ||||
|   | ||||
							
								
								
									
										3104
									
								
								data/1BM8.pdb
									
									
									
									
									
								
							
							
						
						
									
										3104
									
								
								data/1BM8.pdb
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,5 +1,5 @@ | ||||
| >2F1C:X|PDBID|CHAIN|SEQUENCE | ||||
| EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN | ||||
| DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT | ||||
| FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY | ||||
| >2F1C:X|PDBID|CHAIN|SEQUENCE | ||||
| EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN | ||||
| DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT | ||||
| FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY | ||||
| GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH | ||||
							
								
								
									
										12
									
								
								data/3FG7.fa
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								data/3FG7.fa
									
									
									
									
									
								
							| @@ -1,6 +1,6 @@ | ||||
| >3FG7:A|PDBID|CHAIN|SEQUENCE | ||||
| MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK | ||||
| WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM | ||||
| VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT | ||||
| ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD | ||||
| QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN | ||||
| >3FG7:A|PDBID|CHAIN|SEQUENCE | ||||
| MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK | ||||
| WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM | ||||
| VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT | ||||
| ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD | ||||
| QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN | ||||
|   | ||||
| @@ -1,20 +1,20 @@ | ||||
| [ | ||||
|   { "name" : "MBP1_SACCE", | ||||
|     "RefSeqID" : "NP_010227", | ||||
|     "UniProtID" : "P39678", | ||||
|     "taxonomyID" : 559292, | ||||
|     "sequence" : [ | ||||
|        "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF", | ||||
|        "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET", | ||||
|        "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL", | ||||
|        "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ", | ||||
|        "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV", | ||||
|        "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", | ||||
|        "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL", | ||||
|        "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM", | ||||
|        "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ", | ||||
|        "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK", | ||||
|        "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS", | ||||
|        "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"] | ||||
|   } | ||||
| ] | ||||
| [ | ||||
|   { "name" : "MBP1_SACCE", | ||||
|     "RefSeqID" : "NP_010227", | ||||
|     "UniProtID" : "P39678", | ||||
|     "taxonomyID" : 559292, | ||||
|     "sequence" : [ | ||||
|        "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF", | ||||
|        "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET", | ||||
|        "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL", | ||||
|        "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ", | ||||
|        "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV", | ||||
|        "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", | ||||
|        "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL", | ||||
|        "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM", | ||||
|        "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ", | ||||
|        "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK", | ||||
|        "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS", | ||||
|        "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"] | ||||
|   } | ||||
| ] | ||||
|   | ||||
| @@ -1,30 +1,30 @@ | ||||
| >PTPN5-201 cds:protein_coding (ENST00000358540.7) | ||||
| ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA | ||||
| GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG | ||||
| GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT | ||||
| CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC | ||||
| TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT | ||||
| GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC | ||||
| TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG | ||||
| TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC | ||||
| CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC | ||||
| AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG | ||||
| ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG | ||||
| ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG | ||||
| GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG | ||||
| GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC | ||||
| GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG | ||||
| CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT | ||||
| GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT | ||||
| CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG | ||||
| GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC | ||||
| GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC | ||||
| ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC | ||||
| GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC | ||||
| TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC | ||||
| GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG | ||||
| GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT | ||||
| GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT | ||||
| GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG | ||||
| ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC | ||||
| CACCAGTCCCCAGAATGA | ||||
| >PTPN5-201 cds:protein_coding (ENST00000358540.7) | ||||
| ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA | ||||
| GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG | ||||
| GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT | ||||
| CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC | ||||
| TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT | ||||
| GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC | ||||
| TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG | ||||
| TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC | ||||
| CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC | ||||
| AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG | ||||
| ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG | ||||
| ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG | ||||
| GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG | ||||
| GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC | ||||
| GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG | ||||
| CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT | ||||
| GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT | ||||
| CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG | ||||
| GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC | ||||
| GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC | ||||
| ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC | ||||
| GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC | ||||
| TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC | ||||
| GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG | ||||
| GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT | ||||
| GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT | ||||
| GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG | ||||
| ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC | ||||
| CACCAGTCCCCAGAATGA | ||||
|   | ||||
| @@ -1,12 +1,12 @@ | ||||
| >RAB39B cds:protein_coding (ENST00000369454.4) | ||||
| ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC | ||||
| AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC | ||||
| GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC | ||||
| CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG | ||||
| AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC | ||||
| CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG | ||||
| GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA | ||||
| CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG | ||||
| GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT | ||||
| ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT | ||||
| TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG | ||||
| >RAB39B cds:protein_coding (ENST00000369454.4) | ||||
| ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC | ||||
| AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC | ||||
| GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC | ||||
| CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG | ||||
| AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC | ||||
| CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG | ||||
| GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA | ||||
| CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG | ||||
| GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT | ||||
| ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT | ||||
| TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG | ||||
|   | ||||
| @@ -1,131 +1,131 @@ | ||||
|  | ||||
|  | ||||
| ```{css, echo = FALSE} | ||||
|  | ||||
| /* Shade every even-numbered row inside an element of class "striped". */ | ||||
| .striped tr:nth-child(even) { | ||||
|   background: #eaf1ff; | ||||
| } | ||||
| /* Add 5px of padding to the element of class "striped" itself. */ | ||||
| .striped { | ||||
|   padding: 5px; | ||||
| } | ||||
| ``` | ||||
| <small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020</small> | ||||
|  | ||||
|  | ||||
| ```{r setup, include=FALSE} | ||||
| # Make echo = TRUE the default option for all chunks of this document. | ||||
| knitr::opts_chunk$set(echo = TRUE) | ||||
| ``` | ||||
|  | ||||
| ## Phobias! ## | ||||
| We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>. | ||||
|  | ||||
| To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` package and the `xml2` package from CRAN, if we don't have them. | ||||
| ```{r packages} | ||||
| # Install each scraping dependency from CRAN, but only if it is missing. | ||||
| invisible(lapply(c("rvest", "xml2"), function(pkg) { | ||||
|   if (! requireNamespace(pkg, quietly = TRUE)) { | ||||
|     install.packages(pkg) | ||||
|   } | ||||
| })) | ||||
| ``` | ||||
| As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable. | ||||
|  | ||||
| `xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames. | ||||
|  | ||||
| ```{r getPageData, cache=TRUE} | ||||
| # Read and parse the Wikipedia page (this needs network access) ... | ||||
| webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias") | ||||
| # ... then extract the HTML tables it contains into a list. | ||||
| allTables <- rvest::html_table(webPage, fill = TRUE) | ||||
| ``` | ||||
|  | ||||
| There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`. | ||||
|  | ||||
| ```{r collateTables, cache=TRUE} | ||||
| # Collate all tables whose columns are exactly "Phobia" and "Condition" | ||||
| # into one data frame. We start from an empty frame so that the result is | ||||
| # well defined even if no table matches. | ||||
| phobiaTable <- data.frame(Phobia = character(), Condition = character()) | ||||
| for (i in seq_along(allTables)) { | ||||
|   df <- allTables[[i]] | ||||
|   # identical() compares the complete vector of column names. A test with | ||||
|   # all(... == ...) would recycle, and wrongly accept tables without column | ||||
|   # names or with a repeated Phobia/Condition pattern. | ||||
|   if (identical(colnames(df), c("Phobia", "Condition"))) { | ||||
|     phobiaTable <- rbind(phobiaTable, df) | ||||
|   } | ||||
| } | ||||
| ``` | ||||
|  | ||||
| Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them. | ||||
|  | ||||
| <p>  | ||||
| <p> | ||||
|  | ||||
| ```{r , ref.label="randRow", echo=FALSE} | ||||
| ``` | ||||
|  | ||||
| **Table**: seven random phobias<br/> | ||||
| ```{r renderPhobiaTable, echo=FALSE, results='asis'} | ||||
| # Draw (up to) seven distinct row indices. seq_len() and min() keep the | ||||
| # call valid if fewer than seven - or no - phobias were collected. | ||||
| sel <- sample(seq_len(nrow(phobiaTable)), min(7, nrow(phobiaTable))) | ||||
| # Render the selected rows as an HTML table that picks up the "striped" CSS. | ||||
| knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html") | ||||
| ``` | ||||
|  | ||||
| <p>  | ||||
| <p> | ||||
| To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the row numbers of the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function. | ||||
|  | ||||
| ```{r randRow} | ||||
| randRow <- function(M, seed = FALSE) { | ||||
|   # Return a random row from a matrix or data frame M. | ||||
|   # Parameters: | ||||
|   #   M     a matrix or data frame with at least one row. | ||||
|   #   seed  FALSE (default): draw from the current RNG stream; otherwise | ||||
|   #         a value that is coerced to integer and passed to set.seed(). | ||||
|   # Value: | ||||
|   #   The selected row, as returned by M[i, ]. | ||||
|   # If a seed is used, the global RNG state is restored on exit. | ||||
|   if (nrow(M) < 1) { | ||||
|     stop("randRow(): M has no rows to sample from.", call. = FALSE) | ||||
|   } | ||||
|   if (! identical(seed, FALSE)) {          # note: 0 is a valid seed | ||||
|     # Play nice and put the caller's RNG state back when we are done. This | ||||
|     # must happen in the global environment: a plain assignment to | ||||
|     # .Random.seed in here would only create a local variable. | ||||
|     if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) { | ||||
|       oldseed <- get(".Random.seed", envir = globalenv(), inherits = FALSE) | ||||
|       on.exit(assign(".Random.seed", oldseed, envir = globalenv()), | ||||
|               add = TRUE) | ||||
|     } else { | ||||
|       # The RNG has not been used yet in this session: restore that state. | ||||
|       on.exit(rm(list = ".Random.seed", envir = globalenv()), add = TRUE) | ||||
|     } | ||||
|     set.seed(as.integer(seed)) | ||||
|   } | ||||
|   r <- M[sample(seq_len(nrow(M)), 1), ]    # fetch one random row | ||||
|   return(r) | ||||
| } | ||||
| ``` | ||||
| <p>  | ||||
| <p> | ||||
| With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`. | ||||
|  | ||||
| _`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful. | ||||
|  | ||||
| <p>  | ||||
| <p> | ||||
|  | ||||
| Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up. | ||||
|  | ||||
| ```{r preProcess} | ||||
|  | ||||
| # A name qualifies if it contains no blank and ends with "phobia", | ||||
| # preceded by at least one character. | ||||
| sel <- grepl(".phobia$", phobiaTable$Phobia) & ! grepl(" ", phobiaTable$Phobia) | ||||
| names <- phobiaTable$Phobia[sel] | ||||
|  | ||||
| # The entries that did not qualify may collate several names ... | ||||
| x <- phobiaTable$Phobia[! sel] | ||||
| # ... so split them on ", ", then on " ", then on "/", and flatten the | ||||
| # resulting list after each step. | ||||
| x <- Reduce(function(s, sep) { unlist(strsplit(s, sep)) }, | ||||
|             c(", ", " ", "/"), | ||||
|             x) | ||||
| # Apply the same test to the fragments and append the hits to "names". | ||||
| sel <- grepl(".phobia$", x) & ! grepl(" ", x) | ||||
| names <- c(names, x[sel]) | ||||
|  | ||||
| ``` | ||||
|  | ||||
| Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths. | ||||
|  | ||||
| ```{r showHist} | ||||
|  | ||||
| # Number of characters of every collected name. | ||||
| x <- nchar(names) | ||||
| pShort <- names[match(min(x), x)]   # first name of minimal length ... | ||||
| pLong  <- names[match(max(x), x)]   # ... and first name of maximal length. | ||||
| # Histogram of the lengths; the subtitle reports both extremes. | ||||
| hist(x, | ||||
|      col = "#aef5ee", | ||||
|      main = "Length of phobia-names", | ||||
|      xlab = "name", | ||||
|      ylab = "counts", | ||||
|      cex.sub = 0.8, | ||||
|      sub = sprintf("Shortest: %s (%d), Longest: %s (%d)", | ||||
|                    pShort, nchar(pShort), pLong, nchar(pLong))) | ||||
|  | ||||
| ``` | ||||
|  | ||||
| That's all. | ||||
|  | ||||
| <!-- [END] --> | ||||
|  | ||||
|  | ||||
| ```{css, echo = FALSE} | ||||
|  | ||||
| /* Shade every even-numbered row inside an element of class "striped". */ | ||||
| .striped tr:nth-child(even) { | ||||
|   background: #eaf1ff; | ||||
| } | ||||
| /* Add 5px of padding to the element of class "striped" itself. */ | ||||
| .striped { | ||||
|   padding: 5px; | ||||
| } | ||||
| ``` | ||||
| <small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020</small> | ||||
|  | ||||
|  | ||||
| ```{r setup, include=FALSE} | ||||
| # Make echo = TRUE the default option for all chunks of this document. | ||||
| knitr::opts_chunk$set(echo = TRUE) | ||||
| ``` | ||||
|  | ||||
| ## Phobias! ## | ||||
| We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>. | ||||
|  | ||||
| To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` package and the `xml2` package from CRAN, if we don't have them. | ||||
| ```{r packages} | ||||
| # Install each scraping dependency from CRAN, but only if it is missing. | ||||
| invisible(lapply(c("rvest", "xml2"), function(pkg) { | ||||
|   if (! requireNamespace(pkg, quietly = TRUE)) { | ||||
|     install.packages(pkg) | ||||
|   } | ||||
| })) | ||||
| ``` | ||||
| As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable. | ||||
|  | ||||
| `xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames. | ||||
|  | ||||
| ```{r getPageData, cache=TRUE} | ||||
| # Read and parse the Wikipedia page (this needs network access) ... | ||||
| webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias") | ||||
| # ... then extract the HTML tables it contains into a list. | ||||
| allTables <- rvest::html_table(webPage, fill = TRUE) | ||||
| ``` | ||||
|  | ||||
| There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`. | ||||
|  | ||||
| ```{r collateTables, cache=TRUE} | ||||
| # Collate all tables whose columns are exactly "Phobia" and "Condition" | ||||
| # into one data frame. We start from an empty frame so that the result is | ||||
| # well defined even if no table matches. | ||||
| phobiaTable <- data.frame(Phobia = character(), Condition = character()) | ||||
| for (i in seq_along(allTables)) { | ||||
|   df <- allTables[[i]] | ||||
|   # identical() compares the complete vector of column names. A test with | ||||
|   # all(... == ...) would recycle, and wrongly accept tables without column | ||||
|   # names or with a repeated Phobia/Condition pattern. | ||||
|   if (identical(colnames(df), c("Phobia", "Condition"))) { | ||||
|     phobiaTable <- rbind(phobiaTable, df) | ||||
|   } | ||||
| } | ||||
| ``` | ||||
|  | ||||
| Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them. | ||||
|  | ||||
| <p>  | ||||
| <p> | ||||
|  | ||||
| ```{r , ref.label="randRow", echo=FALSE} | ||||
| ``` | ||||
|  | ||||
| **Table**: seven random phobias<br/> | ||||
| ```{r renderPhobiaTable, echo=FALSE, results='asis'} | ||||
| # Draw (up to) seven distinct row indices. seq_len() and min() keep the | ||||
| # call valid if fewer than seven - or no - phobias were collected. | ||||
| sel <- sample(seq_len(nrow(phobiaTable)), min(7, nrow(phobiaTable))) | ||||
| # Render the selected rows as an HTML table that picks up the "striped" CSS. | ||||
| knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html") | ||||
| ``` | ||||
|  | ||||
| <p>  | ||||
| <p> | ||||
| To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the row numbers of the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function. | ||||
|  | ||||
| ```{r randRow} | ||||
| randRow <- function(M, seed = FALSE) { | ||||
|   # Return a random row from a matrix or data frame M. | ||||
|   # Parameters: | ||||
|   #   M     a matrix or data frame with at least one row. | ||||
|   #   seed  FALSE (default): draw from the current RNG stream; otherwise | ||||
|   #         a value that is coerced to integer and passed to set.seed(). | ||||
|   # Value: | ||||
|   #   The selected row, as returned by M[i, ]. | ||||
|   # If a seed is used, the global RNG state is restored on exit. | ||||
|   if (nrow(M) < 1) { | ||||
|     stop("randRow(): M has no rows to sample from.", call. = FALSE) | ||||
|   } | ||||
|   if (! identical(seed, FALSE)) {          # note: 0 is a valid seed | ||||
|     # Play nice and put the caller's RNG state back when we are done. This | ||||
|     # must happen in the global environment: a plain assignment to | ||||
|     # .Random.seed in here would only create a local variable. | ||||
|     if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) { | ||||
|       oldseed <- get(".Random.seed", envir = globalenv(), inherits = FALSE) | ||||
|       on.exit(assign(".Random.seed", oldseed, envir = globalenv()), | ||||
|               add = TRUE) | ||||
|     } else { | ||||
|       # The RNG has not been used yet in this session: restore that state. | ||||
|       on.exit(rm(list = ".Random.seed", envir = globalenv()), add = TRUE) | ||||
|     } | ||||
|     set.seed(as.integer(seed)) | ||||
|   } | ||||
|   r <- M[sample(seq_len(nrow(M)), 1), ]    # fetch one random row | ||||
|   return(r) | ||||
| } | ||||
| ``` | ||||
| <p>  | ||||
| <p> | ||||
| With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`. | ||||
|  | ||||
| _`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful. | ||||
|  | ||||
| <p>  | ||||
| <p> | ||||
|  | ||||
| Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up. | ||||
|  | ||||
| ```{r preProcess} | ||||
|  | ||||
| # A name qualifies if it contains no blank and ends with "phobia", | ||||
| # preceded by at least one character. | ||||
| sel <- grepl(".phobia$", phobiaTable$Phobia) & ! grepl(" ", phobiaTable$Phobia) | ||||
| names <- phobiaTable$Phobia[sel] | ||||
|  | ||||
| # The entries that did not qualify may collate several names ... | ||||
| x <- phobiaTable$Phobia[! sel] | ||||
| # ... so split them on ", ", then on " ", then on "/", and flatten the | ||||
| # resulting list after each step. | ||||
| x <- Reduce(function(s, sep) { unlist(strsplit(s, sep)) }, | ||||
|             c(", ", " ", "/"), | ||||
|             x) | ||||
| # Apply the same test to the fragments and append the hits to "names". | ||||
| sel <- grepl(".phobia$", x) & ! grepl(" ", x) | ||||
| names <- c(names, x[sel]) | ||||
|  | ||||
| ``` | ||||
|  | ||||
| Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths. | ||||
|  | ||||
| ```{r showHist} | ||||
|  | ||||
| # Number of characters of every collected name. | ||||
| x <- nchar(names) | ||||
| pShort <- names[match(min(x), x)]   # first name of minimal length ... | ||||
| pLong  <- names[match(max(x), x)]   # ... and first name of maximal length. | ||||
| # Histogram of the lengths; the subtitle reports both extremes. | ||||
| hist(x, | ||||
|      col = "#aef5ee", | ||||
|      main = "Length of phobia-names", | ||||
|      xlab = "name", | ||||
|      ylab = "counts", | ||||
|      cex.sub = 0.8, | ||||
|      sub = sprintf("Shortest: %s (%d), Longest: %s (%d)", | ||||
|                    pShort, nchar(pShort), pLong, nchar(pLong))) | ||||
|  | ||||
| ``` | ||||
|  | ||||
| That's all. | ||||
|  | ||||
| <!-- [END] --> | ||||
|   | ||||
| @@ -1,43 +1,43 @@ | ||||
| >MBP1 YDL056W SGDID:S000002214 | ||||
| ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT | ||||
| TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA | ||||
| AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG | ||||
| GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG | ||||
| AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC | ||||
| GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC | ||||
| TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA | ||||
| AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT | ||||
| CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG | ||||
| AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA | ||||
| CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA | ||||
| TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA | ||||
| CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG | ||||
| GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA | ||||
| CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT | ||||
| CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT | ||||
| CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG | ||||
| CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT | ||||
| TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT | ||||
| CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT | ||||
| TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT | ||||
| ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT | ||||
| TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT | ||||
| ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT | ||||
| TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT | ||||
| AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT | ||||
| TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT | ||||
| ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG | ||||
| ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC | ||||
| GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT | ||||
| GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT | ||||
| ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA | ||||
| CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA | ||||
| ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC | ||||
| GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG | ||||
| AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG | ||||
| CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA | ||||
| GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG | ||||
| CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG | ||||
| ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT | ||||
| AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA | ||||
| >MBP1 YDL056W SGDID:S000002214 | ||||
| ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT | ||||
| TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA | ||||
| AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG | ||||
| GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG | ||||
| AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC | ||||
| GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC | ||||
| TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA | ||||
| AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT | ||||
| CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG | ||||
| AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA | ||||
| CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA | ||||
| TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA | ||||
| CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG | ||||
| GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA | ||||
| CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT | ||||
| CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT | ||||
| CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG | ||||
| CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT | ||||
| TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT | ||||
| CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT | ||||
| TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT | ||||
| ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT | ||||
| TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT | ||||
| ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT | ||||
| TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT | ||||
| AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT | ||||
| TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT | ||||
| ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG | ||||
| ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC | ||||
| GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT | ||||
| GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT | ||||
| ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA | ||||
| CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA | ||||
| ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC | ||||
| GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG | ||||
| AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG | ||||
| CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA | ||||
| GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG | ||||
| CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG | ||||
| ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT | ||||
| AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA | ||||
| GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA | ||||
| @@ -1,47 +1,47 @@ | ||||
| SGD_features.tab | ||||
|  | ||||
| The latest version of the SGD_features.tab file is based on Genome Version R64-2-1. | ||||
|  | ||||
| The SGD_features.tab file is updated weekly (Saturday). | ||||
|  | ||||
| NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously | ||||
| used chromosomal_feature.tab file. | ||||
|  | ||||
| File contents: | ||||
|  | ||||
| 1. Information on current chromosomal features in SGD, including Dubious ORFs.  | ||||
| Also contains coordinates of introns, exons, and other subfeatures that are located | ||||
| within a chromosomal feature. | ||||
|  | ||||
| 2. The relationship between subfeatures and the feature in which they | ||||
| are located is identified by the feature name in column #7 (parent | ||||
| feature). For example, the parent feature of the intron found in | ||||
| ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is | ||||
| chromosome 6. | ||||
|  | ||||
| 3. The coordinates of all features are in chromosomal coordinates. | ||||
|  | ||||
|  | ||||
| Columns within SGD_features.tab: | ||||
|  | ||||
| 1.   Primary SGDID (mandatory) | ||||
| 2.   Feature type (mandatory) | ||||
| 3.   Feature qualifier (optional) | ||||
| 4.   Feature name (optional) | ||||
| 5.   Standard gene name (optional) | ||||
| 6.   Alias (optional, multiples separated by |) | ||||
| 7.   Parent feature name (optional) | ||||
| 8.   Secondary SGDID (optional, multiples separated by |) | ||||
| 9.   Chromosome (optional) | ||||
| 10.  Start_coordinate (optional) | ||||
| 11.  Stop_coordinate (optional) | ||||
| 12.  Strand (optional) | ||||
| 13.  Genetic position (optional) | ||||
| 14.  Coordinate version (optional) | ||||
| 15.  Sequence version (optional) | ||||
| 16.  Description (optional) | ||||
|  | ||||
| Note that "chromosome 17" is the mitochondrial chromosome. | ||||
|  | ||||
| The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff | ||||
|  | ||||
| SGD_features.tab | ||||
|  | ||||
| The latest version of the SGD_features.tab file is based on Genome Version R64-2-1. | ||||
|  | ||||
| The SGD_features.tab file is updated weekly (Saturday). | ||||
|  | ||||
| NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously | ||||
| used chromosomal_feature.tab file. | ||||
|  | ||||
| File contents: | ||||
|  | ||||
| 1. Information on current chromosomal features in SGD, including Dubious ORFs.  | ||||
| Also contains coordinates of introns, exons, and other subfeatures that are located | ||||
| within a chromosomal feature. | ||||
|  | ||||
| 2. The relationship between subfeatures and the feature in which they | ||||
| are located is identified by the feature name in column #7 (parent | ||||
| feature). For example, the parent feature of the intron found in | ||||
| ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is | ||||
| chromosome 6. | ||||
|  | ||||
| 3. The coordinates of all features are in chromosomal coordinates. | ||||
|  | ||||
|  | ||||
| Columns within SGD_features.tab: | ||||
|  | ||||
| 1.   Primary SGDID (mandatory) | ||||
| 2.   Feature type (mandatory) | ||||
| 3.   Feature qualifier (optional) | ||||
| 4.   Feature name (optional) | ||||
| 5.   Standard gene name (optional) | ||||
| 6.   Alias (optional, multiples separated by |) | ||||
| 7.   Parent feature name (optional) | ||||
| 8.   Secondary SGDID (optional, multiples separated by |) | ||||
| 9.   Chromosome (optional) | ||||
| 10.  Start_coordinate (optional) | ||||
| 11.  Stop_coordinate (optional) | ||||
| 12.  Strand (optional) | ||||
| 13.  Genetic position (optional) | ||||
| 14.  Coordinate version (optional) | ||||
| 15.  Sequence version (optional) | ||||
| 16.  Description (optional) | ||||
|  | ||||
| Note that "chromosome 17" is the mitochondrial chromosome. | ||||
|  | ||||
| The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff | ||||
|  | ||||
|   | ||||
							
								
								
									
										32908
									
								
								data/SGD_features.tab
									
									
									
									
									
								
							
							
						
						
									
										32908
									
								
								data/SGD_features.tab
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										2030
									
								
								data/Species.csv
									
									
									
									
									
								
							
							
						
						
									
										2030
									
								
								data/Species.csv
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,179 +1,179 @@ | ||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936 | ||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334 | ||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078 | ||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131 | ||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936 | ||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334 | ||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131 | ||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078 | ||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131 | ||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078 | ||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334 | ||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936 | ||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334 | ||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131 | ||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078 | ||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936 | ||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936 | ||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078 | ||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131 | ||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334 | ||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078 | ||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936 | ||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334 | ||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131 | ||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131 | ||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936 | ||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334 | ||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078 | ||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131 | ||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936 | ||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334 | ||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078 | ||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936 | ||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131 | ||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334 | ||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078 | ||||
| 11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936 | ||||
| 11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078 | ||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334 | ||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936 | ||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131 | ||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078 | ||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334 | ||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131 | ||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936 | ||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078 | ||||
| 7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078 | ||||
| 7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078 | ||||
| 7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936 | ||||
| 7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936 | ||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936 | ||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078 | ||||
| 5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936 | ||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334 | ||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131 | ||||
| 5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078 | ||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078 | ||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334 | ||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936 | ||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131 | ||||
| 3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936 | ||||
| 3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936 | ||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078 | ||||
| 3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936 | ||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936 | ||||
| 3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936 | ||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936 | ||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131 | ||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334 | ||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334 | ||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131 | ||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131 | ||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936 | ||||
| 2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936 | ||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334 | ||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131 | ||||
| 2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936 | ||||
| 2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078 | ||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078 | ||||
| 2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078 | ||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936 | ||||
| 2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936 | ||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078 | ||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334 | ||||
| 2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334 | ||||
| 1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334 | ||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334 | ||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078 | ||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078 | ||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936 | ||||
| 0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936 | ||||
| 0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131 | ||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131 | ||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131 | ||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131 | ||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334 | ||||
| 0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936 | ||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936 | ||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936 | ||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078 | ||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936 | ||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334 | ||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078 | ||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131 | ||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936 | ||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334 | ||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131 | ||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078 | ||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131 | ||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078 | ||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334 | ||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936 | ||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334 | ||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131 | ||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078 | ||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936 | ||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936 | ||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078 | ||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131 | ||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334 | ||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078 | ||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936 | ||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334 | ||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131 | ||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131 | ||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936 | ||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334 | ||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078 | ||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131 | ||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936 | ||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334 | ||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078 | ||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936 | ||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131 | ||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334 | ||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078 | ||||
| 11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936 | ||||
| 11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078 | ||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334 | ||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936 | ||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131 | ||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078 | ||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334 | ||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131 | ||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936 | ||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078 | ||||
| 7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078 | ||||
| 7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078 | ||||
| 7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936 | ||||
| 7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936 | ||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936 | ||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078 | ||||
| 5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936 | ||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334 | ||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131 | ||||
| 5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078 | ||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078 | ||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334 | ||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936 | ||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131 | ||||
| 3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936 | ||||
| 3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936 | ||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078 | ||||
| 3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078 | ||||
| 3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936 | ||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936 | ||||
| 3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936 | ||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936 | ||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131 | ||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334 | ||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334 | ||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131 | ||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131 | ||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936 | ||||
| 2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936 | ||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334 | ||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131 | ||||
| 2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936 | ||||
| 2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078 | ||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078 | ||||
| 2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078 | ||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936 | ||||
| 2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936 | ||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078 | ||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334 | ||||
| 2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334 | ||||
| 1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334 | ||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334 | ||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078 | ||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078 | ||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936 | ||||
| 0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078 | ||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936 | ||||
| 0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131 | ||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131 | ||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131 | ||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131 | ||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334 | ||||
| 0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334 | ||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334 | ||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131 | ||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936 | ||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936 | ||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936 | ||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936 | ||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936 | ||||
| 1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078 | ||||
| 1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078 | ||||
|   | ||||
| 
 | 
| @@ -1,49 +1,49 @@ | ||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||
| 2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094 | ||||
| 2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094 | ||||
| 1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094 | ||||
| 1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094 | ||||
| 1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094 | ||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||
| 2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094 | ||||
| 2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094 | ||||
| 1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094 | ||||
| 1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094 | ||||
| 1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094 | ||||
| 0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094 | ||||
| 1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094 | ||||
|   | ||||
| 
 | 
| @@ -1,113 +1,113 @@ | ||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||
| 5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677 | ||||
| 4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677 | ||||
| 3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597 | ||||
| 3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597 | ||||
| 2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597 | ||||
| 2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597 | ||||
| 2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597 | ||||
| 2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818 | ||||
| 1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597 | ||||
| 1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597 | ||||
| 1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818 | ||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818 | ||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818 | ||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818 | ||||
| 1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597 | ||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818 | ||||
| 1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677 | ||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677 | ||||
| 1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677 | ||||
| 1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677 | ||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||
| 5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677 | ||||
| 4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677 | ||||
| 3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597 | ||||
| 3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597 | ||||
| 2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597 | ||||
| 2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597 | ||||
| 2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597 | ||||
| 2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677 | ||||
| 2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818 | ||||
| 1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597 | ||||
| 1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597 | ||||
| 1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818 | ||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818 | ||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818 | ||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818 | ||||
| 1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597 | ||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818 | ||||
| 1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597 | ||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677 | ||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597 | ||||
| 1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677 | ||||
| 0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677 | ||||
| 1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677 | ||||
| 1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677 | ||||
| 1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677 | ||||
|   | ||||
| 
 | 
| @@ -1,39 +1,39 @@ | ||||
| >MBP1_ASPNI AN3154 XP_660758 Q5B8H6 | ||||
| -VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI | ||||
| LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY | ||||
|  | ||||
| >MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86 | ||||
| KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI | ||||
| LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY | ||||
|  | ||||
| >MBP1_COPCI  - XP_001837394 A8NYC6 | ||||
| QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV | ||||
| LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF | ||||
|  | ||||
| >MBP1_CRYNE  - XP_569090 Q5KMQ9 | ||||
| DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA | ||||
| LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV | ||||
|  | ||||
| >MBP1_NEUCR Swi4 XP_955821 Q7RW59 | ||||
| -IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI | ||||
| LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF | ||||
|  | ||||
| >MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4 | ||||
| -IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV | ||||
| LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF | ||||
|  | ||||
| >MBP1_SACCE Mbp1 NP_010227 P39678 | ||||
| QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI | ||||
| LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF | ||||
|  | ||||
| >MBP1_SCHPO Res2 NP_593032 P41412 | ||||
| -VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV | ||||
| LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS- | ||||
|  | ||||
| >MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35 | ||||
| -IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV | ||||
| LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY | ||||
|  | ||||
| >MBP1_WALME  - XP_006957051 I4YGC0 | ||||
| -IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI | ||||
| LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY | ||||
| >MBP1_ASPNI AN3154 XP_660758 Q5B8H6 | ||||
| -VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI | ||||
| LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY | ||||
|  | ||||
| >MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86 | ||||
| KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI | ||||
| LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY | ||||
|  | ||||
| >MBP1_COPCI  - XP_001837394 A8NYC6 | ||||
| QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV | ||||
| LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF | ||||
|  | ||||
| >MBP1_CRYNE  - XP_569090 Q5KMQ9 | ||||
| DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA | ||||
| LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV | ||||
|  | ||||
| >MBP1_NEUCR Swi4 XP_955821 Q7RW59 | ||||
| -IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI | ||||
| LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF | ||||
|  | ||||
| >MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4 | ||||
| -IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV | ||||
| LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF | ||||
|  | ||||
| >MBP1_SACCE Mbp1 NP_010227 P39678 | ||||
| QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI | ||||
| LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF | ||||
|  | ||||
| >MBP1_SCHPO Res2 NP_593032 P41412 | ||||
| -VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV | ||||
| LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS- | ||||
|  | ||||
| >MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35 | ||||
| -IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV | ||||
| LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY | ||||
|  | ||||
| >MBP1_WALME  - XP_006957051 I4YGC0 | ||||
| -IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI | ||||
| LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY | ||||
|   | ||||
| @@ -1,490 +1,490 @@ | ||||
| [ | ||||
|   { "name" : "68476_WALME", | ||||
|     "RefSeqID" : "XP_006957790", | ||||
|     "UniProtID" : "I4YDD8", | ||||
|     "taxonomyID" : "671144", | ||||
|     "sequence" : [ | ||||
|              "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ", | ||||
|              "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER", | ||||
|              "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG", | ||||
|              "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE", | ||||
|              "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA", | ||||
|              "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD", | ||||
|              "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE", | ||||
|              "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE", | ||||
|              "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"] | ||||
|   }, | ||||
|   { "name" : "00846_COPCI", | ||||
|     "RefSeqID" : "XP_001831299", | ||||
|     "UniProtID" : "A8N8X1", | ||||
|     "taxonomyID" : "240176", | ||||
|     "sequence" : [ | ||||
|              "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG", | ||||
|              "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS", | ||||
|              "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH", | ||||
|              "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP", | ||||
|              "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF", | ||||
|              "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK", | ||||
|              "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN", | ||||
|              "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT", | ||||
|              "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS", | ||||
|              "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE", | ||||
|              "SEAQVVDIGRVSGFMQKVRDGII"] | ||||
|   }, | ||||
|   { "name" : "8533_BIPOR", | ||||
|     "RefSeqID" : "XP_007691662", | ||||
|     "UniProtID" : "W6ZE71", | ||||
|     "taxonomyID" : "930090", | ||||
|     "sequence" : [ | ||||
|              "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR", | ||||
|              "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS", | ||||
|              "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT", | ||||
|              "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT", | ||||
|              "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT", | ||||
|              "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA", | ||||
|              "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME", | ||||
|              "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID", | ||||
|              "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE", | ||||
|              "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML", | ||||
|              "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"] | ||||
|   }, | ||||
|   { "name" : "PGTG_02039", | ||||
|     "RefSeqID" : "XP_003320997", | ||||
|     "UniProtID" : "E3JX03", | ||||
|     "taxonomyID" : "418459", | ||||
|     "sequence" : [ | ||||
|              "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV", | ||||
|              "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS", | ||||
|              "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP", | ||||
|              "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP", | ||||
|              "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL", | ||||
|              "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH", | ||||
|              "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI", | ||||
|              "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN", | ||||
|              "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC", | ||||
|              "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK", | ||||
|              "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"] | ||||
|   }, | ||||
|   { "name" : "MBPA_ASPNI", | ||||
|     "RefSeqID" : "XP_664319", | ||||
|     "UniProtID" : "Q5AYB5", | ||||
|     "taxonomyID" : "227321", | ||||
|     "sequence" : [ | ||||
|              "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG", | ||||
|              "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK", | ||||
|              "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP", | ||||
|              "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG", | ||||
|              "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER", | ||||
|              "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK", | ||||
|              "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS", | ||||
|              "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA", | ||||
|              "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS", | ||||
|              "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP", | ||||
|              "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL", | ||||
|              "MRDRGQDW"] | ||||
|   }, | ||||
|   { "name" : "05520_CRYNE", | ||||
|     "RefSeqID" : "XP_570545", | ||||
|     "UniProtID" : "Q5KHS0", | ||||
|     "taxonomyID" : "214684", | ||||
|     "sequence" : [ | ||||
|              "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH", | ||||
|              "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART", | ||||
|              "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL", | ||||
|              "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM", | ||||
|              "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL", | ||||
|              "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS", | ||||
|              "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG", | ||||
|              "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD", | ||||
|              "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT", | ||||
|              "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP", | ||||
|              "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS", | ||||
|              "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"] | ||||
|   }, | ||||
|   { "name" : "RES1_SCHPO", | ||||
|     "RefSeqID" : "NP_595496", | ||||
|     "UniProtID" : "P33520", | ||||
|     "taxonomyID" : "284812", | ||||
|     "sequence" : [ | ||||
|              "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP", | ||||
|              "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS", | ||||
|              "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA", | ||||
|              "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS", | ||||
|              "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS", | ||||
|              "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL", | ||||
|              "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA", | ||||
|              "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"] | ||||
|   }, | ||||
|   { "name" : "CDC10_SCHPO", | ||||
|     "RefSeqID" : "NP_596132", | ||||
|     "UniProtID" : "P01129", | ||||
|     "taxonomyID" : "284812", | ||||
|     "sequence" : [ | ||||
|              "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL", | ||||
|              "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP", | ||||
|              "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ", | ||||
|              "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS", | ||||
|              "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV", | ||||
|              "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ", | ||||
|              "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR", | ||||
|              "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ", | ||||
|              "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS", | ||||
|              "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"] | ||||
|   }, | ||||
|   { "name" : "05338_USTMA", | ||||
|     "RefSeqID" : "XP_011392041", | ||||
|     "UniProtID" : "A0A0D1BWD8", | ||||
|     "taxonomyID" : "237631", | ||||
|     "sequence" : [ | ||||
|              "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT", | ||||
|              "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI", | ||||
|              "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR", | ||||
|              "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL", | ||||
|              "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA", | ||||
|              "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI", | ||||
|              "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA", | ||||
|              "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT", | ||||
|              "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL", | ||||
|              "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL", | ||||
|              "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN", | ||||
|              "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG", | ||||
|              "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP", | ||||
|              "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"] | ||||
|   }, | ||||
|   { "name" : "SWI4_SACCE", | ||||
|     "RefSeqID" : "NP_011036", | ||||
|     "UniProtID" : "P25302", | ||||
|     "taxonomyID" : "559292", | ||||
|     "sequence" : [ | ||||
|              "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF", | ||||
|              "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP", | ||||
|              "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN", | ||||
|              "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS", | ||||
|              "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK", | ||||
|              "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY", | ||||
|              "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK", | ||||
|              "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN", | ||||
|              "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ", | ||||
|              "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS", | ||||
|              "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV", | ||||
|              "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL", | ||||
|              "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL", | ||||
|              "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"] | ||||
|   }, | ||||
|   { "name" : "SWI6_NEUCR", | ||||
|     "RefSeqID" : "XP_962967", | ||||
|     "UniProtID" : "Q7SBG9", | ||||
|     "taxonomyID" : "367110", | ||||
|     "sequence" : [ | ||||
|              "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN", | ||||
|              "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT", | ||||
|              "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF", | ||||
|              "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT", | ||||
|              "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA", | ||||
|              "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV", | ||||
|              "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG", | ||||
|              "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL", | ||||
|              "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT", | ||||
|              "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES", | ||||
|              "EKPELEIARVRRFLGGVEGVVH"] | ||||
|   }, | ||||
|   { "name" : "15042_USTMA", | ||||
|     "RefSeqID" : "XP_011388143", | ||||
|     "UniProtID" : "A0A0D1CVS5", | ||||
|     "taxonomyID" : "237631", | ||||
|     "sequence" : [ | ||||
|              "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA", | ||||
|              "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN", | ||||
|              "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE", | ||||
|              "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI", | ||||
|              "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS", | ||||
|              "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT", | ||||
|              "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR", | ||||
|              "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"] | ||||
|   }, | ||||
|   { "name" : "04778_USTMA", | ||||
|     "RefSeqID" : "XP_011391646", | ||||
|     "UniProtID" : "A0A0D1DQM4", | ||||
|     "taxonomyID" : "237631", | ||||
|     "sequence" : [ | ||||
|              "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK", | ||||
|              "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS", | ||||
|              "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG", | ||||
|              "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG", | ||||
|              "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD", | ||||
|              "PESAQLFTIHDFGSDPFYAEQVERG"] | ||||
|   }, | ||||
|   { "name" : "STUA_ASPNI", | ||||
|     "RefSeqID" : "XP_663440", | ||||
|     "UniProtID" : "P36011", | ||||
|     "taxonomyID" : "227321", | ||||
|     "sequence" : [ | ||||
|              "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS", | ||||
|              "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM", | ||||
|              "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ", | ||||
|              "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL", | ||||
|              "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR", | ||||
|              "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP", | ||||
|              "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD", | ||||
|              "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"] | ||||
|   }, | ||||
|   { "name" : "STUA_NEUCR", | ||||
|     "RefSeqID" : "XP_960837", | ||||
|     "UniProtID" : "Q1K6U0", | ||||
|     "taxonomyID" : "367110", | ||||
|     "sequence" : [ | ||||
|              "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG", | ||||
|              "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT", | ||||
|              "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK", | ||||
|              "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG", | ||||
|              "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ", | ||||
|              "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV", | ||||
|              "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM", | ||||
|              "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH", | ||||
|              "RRR"] | ||||
|   }, | ||||
|   { "name" : "PHD1_SACCE", | ||||
|     "RefSeqID" : "NP_012881", | ||||
|     "UniProtID" : "P36093", | ||||
|     "taxonomyID" : "559292", | ||||
|     "sequence" : [ | ||||
|              "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA", | ||||
|              "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS", | ||||
|              "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS", | ||||
|              "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI", | ||||
|              "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"] | ||||
|   }, | ||||
|   { "name" : "08099_COPCI", | ||||
|     "RefSeqID" : "XP_001836714", | ||||
|     "UniProtID" : "A8NVH3", | ||||
|     "taxonomyID" : "240176", | ||||
|     "sequence" : [ | ||||
|              "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS", | ||||
|              "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS", | ||||
|              "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS", | ||||
|              "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV", | ||||
|              "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ", | ||||
|              "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA", | ||||
|              "PHRPW"] | ||||
|   }, | ||||
|   { "name" : "68479_WALME", | ||||
|     "RefSeqID" : "XP_006957792", | ||||
|     "UniProtID" : "I4YDE0", | ||||
|     "taxonomyID" : "671144", | ||||
|     "sequence" : [ | ||||
|              "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF", | ||||
|              "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS", | ||||
|              "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS", | ||||
|              "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"] | ||||
|   }, | ||||
|   { "name" : "11943_PUCGR", | ||||
|     "RefSeqID" : "XP_003330006", | ||||
|     "UniProtID" : "E3KMR2", | ||||
|     "taxonomyID" : "418459", | ||||
|     "sequence" : [ | ||||
|              "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV", | ||||
|              "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP", | ||||
|              "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA", | ||||
|              "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS", | ||||
|              "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN", | ||||
|              "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA", | ||||
|              "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL", | ||||
|              "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"] | ||||
|   }, | ||||
|   { "name" : "03082_PUCGR", | ||||
|     "RefSeqID" : "XP_003321545", | ||||
|     "UniProtID" : "E3JYK1", | ||||
|     "taxonomyID" : "418459", | ||||
|     "sequence" : [ | ||||
|              "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV", | ||||
|              "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP", | ||||
|              "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL", | ||||
|              "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK", | ||||
|              "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP", | ||||
|              "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA", | ||||
|              "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD", | ||||
|              "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR", | ||||
|              "LIMEWNPSC"] | ||||
|   }, | ||||
|   { "name" : "SOK2_SACCE", | ||||
|     "RefSeqID" : "NP_013729", | ||||
|     "UniProtID" : "P53438", | ||||
|     "taxonomyID" : "559292", | ||||
|     "sequence" : [ | ||||
|              "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS", | ||||
|              "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS", | ||||
|              "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP", | ||||
|              "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY", | ||||
|              "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG", | ||||
|              "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM", | ||||
|              "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS", | ||||
|              "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA", | ||||
|              "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ", | ||||
|              "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"] | ||||
|   }, | ||||
|   { "name" : "14426_COPCI", | ||||
|     "RefSeqID" : "XP_002911429", | ||||
|     "UniProtID" : "D6RMB0", | ||||
|     "taxonomyID" : "240176", | ||||
|     "sequence" : [ | ||||
|              "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA", | ||||
|              "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP", | ||||
|              "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT", | ||||
|              "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK", | ||||
|              "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA", | ||||
|              "ITYLPNFL"] | ||||
|   }, | ||||
|   { "name" : "BQT4_SCHPO", | ||||
|     "RefSeqID" : "NP_596166", | ||||
|     "UniProtID" : "O60158", | ||||
|     "taxonomyID" : "284812", | ||||
|     "sequence" : [ | ||||
|              "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA", | ||||
|              "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG", | ||||
|              "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS", | ||||
|              "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN", | ||||
|              "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK", | ||||
|              "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"] | ||||
|   }, | ||||
|   { "name" : "PGTG_05590", | ||||
|     "RefSeqID" : "XP_003323688", | ||||
|     "UniProtID" : "E3K4V4", | ||||
|     "taxonomyID" : "418459", | ||||
|     "sequence" : [ | ||||
|              "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT", | ||||
|              "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS", | ||||
|              "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS", | ||||
|              "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND", | ||||
|              "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN", | ||||
|              "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG", | ||||
|              "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"] | ||||
|   }, | ||||
|   { "name" : "06560_NEUCR", | ||||
|     "RefSeqID" : "XP_962267", | ||||
|     "UniProtID" : "Q7S9H5", | ||||
|     "taxonomyID" : "367110", | ||||
|     "sequence" : [ | ||||
|              "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP", | ||||
|              "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP", | ||||
|              "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT", | ||||
|              "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV", | ||||
|              "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE", | ||||
|              "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV", | ||||
|              "ANVL"] | ||||
|   }, | ||||
|   { "name" : "81480_BIPOR", | ||||
|     "RefSeqID" : "XP_007682909", | ||||
|     "UniProtID" : "W6ZKJ4", | ||||
|     "taxonomyID" : "930090", | ||||
|     "sequence" : [ | ||||
|              "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN", | ||||
|              "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI", | ||||
|              "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP", | ||||
|              "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE", | ||||
|              "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK", | ||||
|              "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"] | ||||
|   }, | ||||
|   { "name" : "01622_ASPNI", | ||||
|     "RefSeqID" : "XP_657766", | ||||
|     "UniProtID" : "Q5BH18", | ||||
|     "taxonomyID" : "227321", | ||||
|     "sequence" : [ | ||||
|              "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP", | ||||
|              "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA", | ||||
|              "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL", | ||||
|              "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV", | ||||
|              "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE", | ||||
|              "RVRNRALMGVTAAFALAKPALVLLEA"] | ||||
|   }, | ||||
|   { "name" : "05405_ASPNI", | ||||
|     "RefSeqID" : "XP_663009", | ||||
|     "UniProtID" : "Q5B225", | ||||
|     "taxonomyID" : "227321", | ||||
|     "sequence" : [ | ||||
|              "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD", | ||||
|              "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM", | ||||
|              "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA", | ||||
|              "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS", | ||||
|              "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP", | ||||
|              "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR", | ||||
|              "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD", | ||||
|              "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"] | ||||
|   }, | ||||
|   { "name" : "105954_BIPOR", | ||||
|     "RefSeqID" : "XP_007691967", | ||||
|     "UniProtID" : "W6Z1H5", | ||||
|     "taxonomyID" : "930090", | ||||
|     "sequence" : [ | ||||
|              "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI", | ||||
|              "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR", | ||||
|              "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE", | ||||
|              "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP", | ||||
|              "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV", | ||||
|              "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"] | ||||
|   }, | ||||
|   { "name" : "69819_WALME", | ||||
|     "RefSeqID" : "XP_006959479", | ||||
|     "UniProtID" : "I4Y911", | ||||
|     "taxonomyID" : "671144", | ||||
|     "sequence" : [ | ||||
|              "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI", | ||||
|              "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS", | ||||
|              "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL", | ||||
|              "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ", | ||||
|              "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"] | ||||
|   }, | ||||
|   { "name" : "02840_CRYNE", | ||||
|     "RefSeqID" : "XP_568872", | ||||
|     "UniProtID" : "Q5KM59", | ||||
|     "taxonomyID" : "214684", | ||||
|     "sequence" : [ | ||||
|              "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG", | ||||
|              "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV", | ||||
|              "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT", | ||||
|              "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA", | ||||
|              "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"] | ||||
|   }, | ||||
|   { "name" : "11055_USTMA", | ||||
|     "RefSeqID" : "XP_011390537", | ||||
|     "UniProtID" : "A0A0D1DZM8", | ||||
|     "taxonomyID" : "237631", | ||||
|     "sequence" : [ | ||||
|              "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK", | ||||
|              "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF", | ||||
|              "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV", | ||||
|              "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR", | ||||
|              "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV", | ||||
|              "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL", | ||||
|              "ASILPW"] | ||||
|   }, | ||||
|   { "name" : "XBP1_NEUCR", | ||||
|     "RefSeqID" : "XP_962373", | ||||
|     "UniProtID" : "Q7S9W7", | ||||
|     "taxonomyID" : "367110", | ||||
|     "sequence" : [ | ||||
|              "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT", | ||||
|              "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR", | ||||
|              "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH", | ||||
|              "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP", | ||||
|              "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ", | ||||
|              "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT", | ||||
|              "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT", | ||||
|              "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD", | ||||
|              "DEGDYEHEQQYRRKRRRLLLVGRAKSF"] | ||||
|   }, | ||||
|   { "name" : "XBP1_SACCE", | ||||
|     "RefSeqID" : "NP_012165", | ||||
|     "UniProtID" : "P40489", | ||||
|     "taxonomyID" : "559292", | ||||
|     "sequence" : [ | ||||
|              "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA", | ||||
|              "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR", | ||||
|              "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE", | ||||
|              "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS", | ||||
|              "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ", | ||||
|              "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND", | ||||
|              "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY", | ||||
|              "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK", | ||||
|              "FKTNSKQ"] | ||||
|   } | ||||
| ] | ||||
| [ | ||||
|   { "name" : "68476_WALME", | ||||
|     "RefSeqID" : "XP_006957790", | ||||
|     "UniProtID" : "I4YDD8", | ||||
|     "taxonomyID" : "671144", | ||||
|     "sequence" : [ | ||||
|              "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ", | ||||
|              "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER", | ||||
|              "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG", | ||||
|              "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE", | ||||
|              "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA", | ||||
|              "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD", | ||||
|              "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE", | ||||
|              "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE", | ||||
|              "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"] | ||||
|   }, | ||||
|   { "name" : "00846_COPCI", | ||||
|     "RefSeqID" : "XP_001831299", | ||||
|     "UniProtID" : "A8N8X1", | ||||
|     "taxonomyID" : "240176", | ||||
|     "sequence" : [ | ||||
|              "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG", | ||||
|              "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS", | ||||
|              "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH", | ||||
|              "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP", | ||||
|              "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF", | ||||
|              "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK", | ||||
|              "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN", | ||||
|              "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT", | ||||
|              "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS", | ||||
|              "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE", | ||||
|              "SEAQVVDIGRVSGFMQKVRDGII"] | ||||
|   }, | ||||
|   { "name" : "8533_BIPOR", | ||||
|     "RefSeqID" : "XP_007691662", | ||||
|     "UniProtID" : "W6ZE71", | ||||
|     "taxonomyID" : "930090", | ||||
|     "sequence" : [ | ||||
|              "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR", | ||||
|              "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS", | ||||
|              "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT", | ||||
|              "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT", | ||||
|              "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT", | ||||
|              "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA", | ||||
|              "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME", | ||||
|              "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID", | ||||
|              "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE", | ||||
|              "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML", | ||||
|              "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"] | ||||
|   }, | ||||
|   { "name" : "PGTG_02039", | ||||
|     "RefSeqID" : "XP_003320997", | ||||
|     "UniProtID" : "E3JX03", | ||||
|     "taxonomyID" : "418459", | ||||
|     "sequence" : [ | ||||
|              "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV", | ||||
|              "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS", | ||||
|              "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP", | ||||
|              "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP", | ||||
|              "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL", | ||||
|              "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH", | ||||
|              "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI", | ||||
|              "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN", | ||||
|              "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC", | ||||
|              "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK", | ||||
|              "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"] | ||||
|   }, | ||||
|   { "name" : "MBPA_ASPNI", | ||||
|     "RefSeqID" : "XP_664319", | ||||
|     "UniProtID" : "Q5AYB5", | ||||
|     "taxonomyID" : "227321", | ||||
|     "sequence" : [ | ||||
|              "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG", | ||||
|              "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK", | ||||
|              "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP", | ||||
|              "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG", | ||||
|              "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER", | ||||
|              "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK", | ||||
|              "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS", | ||||
|              "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA", | ||||
|              "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS", | ||||
|              "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP", | ||||
|              "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL", | ||||
|              "MRDRGQDW"] | ||||
|   }, | ||||
|   { "name" : "05520_CRYNE", | ||||
|     "RefSeqID" : "XP_570545", | ||||
|     "UniProtID" : "Q5KHS0", | ||||
|     "taxonomyID" : "214684", | ||||
|     "sequence" : [ | ||||
|              "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH", | ||||
|              "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART", | ||||
|              "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL", | ||||
|              "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM", | ||||
|              "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL", | ||||
|              "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS", | ||||
|              "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG", | ||||
|              "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD", | ||||
|              "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT", | ||||
|              "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP", | ||||
|              "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS", | ||||
|              "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"] | ||||
|   }, | ||||
|   { "name" : "RES1_SCHPO", | ||||
|     "RefSeqID" : "NP_595496", | ||||
|     "UniProtID" : "P33520", | ||||
|     "taxonomyID" : "284812", | ||||
|     "sequence" : [ | ||||
|              "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP", | ||||
|              "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS", | ||||
|              "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA", | ||||
|              "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS", | ||||
|              "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS", | ||||
|              "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL", | ||||
|              "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA", | ||||
|              "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"] | ||||
|   }, | ||||
|   { "name" : "CDC10_SCHPO", | ||||
|     "RefSeqID" : "NP_596132", | ||||
|     "UniProtID" : "P01129", | ||||
|     "taxonomyID" : "284812", | ||||
|     "sequence" : [ | ||||
|              "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL", | ||||
|              "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP", | ||||
|              "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ", | ||||
|              "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS", | ||||
|              "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV", | ||||
|              "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ", | ||||
|              "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR", | ||||
|              "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ", | ||||
|              "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS", | ||||
|              "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"] | ||||
|   }, | ||||
|   { "name" : "05338_USTMA", | ||||
|     "RefSeqID" : "XP_011392041", | ||||
|     "UniProtID" : "A0A0D1BWD8", | ||||
|     "taxonomyID" : "237631", | ||||
|     "sequence" : [ | ||||
|              "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT", | ||||
|              "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI", | ||||
|              "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR", | ||||
|              "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL", | ||||
|              "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA", | ||||
|              "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI", | ||||
|              "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA", | ||||
|              "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT", | ||||
|              "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL", | ||||
|              "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL", | ||||
|              "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN", | ||||
|              "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG", | ||||
|              "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP", | ||||
|              "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"] | ||||
|   }, | ||||
|   { "name" : "SWI4_SACCE", | ||||
|     "RefSeqID" : "NP_011036", | ||||
|     "UniProtID" : "P25302", | ||||
|     "taxonomyID" : "559292", | ||||
|     "sequence" : [ | ||||
|              "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF", | ||||
|              "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP", | ||||
|              "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN", | ||||
|              "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS", | ||||
|              "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK", | ||||
|              "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY", | ||||
|              "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK", | ||||
|              "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN", | ||||
|              "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ", | ||||
|              "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS", | ||||
|              "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV", | ||||
|              "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL", | ||||
|              "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL", | ||||
|              "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"] | ||||
|   }, | ||||
|   { "name" : "SWI6_NEUCR", | ||||
|     "RefSeqID" : "XP_962967", | ||||
|     "UniProtID" : "Q7SBG9", | ||||
|     "taxonomyID" : "367110", | ||||
|     "sequence" : [ | ||||
|              "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN", | ||||
|              "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT", | ||||
|              "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF", | ||||
|              "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT", | ||||
|              "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA", | ||||
|              "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV", | ||||
|              "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG", | ||||
|              "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL", | ||||
|              "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT", | ||||
|              "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES", | ||||
|              "EKPELEIARVRRFLGGVEGVVH"] | ||||
|   }, | ||||
|   { "name" : "15042_USTMA", | ||||
|     "RefSeqID" : "XP_011388143", | ||||
|     "UniProtID" : "A0A0D1CVS5", | ||||
|     "taxonomyID" : "237631", | ||||
|     "sequence" : [ | ||||
|              "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA", | ||||
|              "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN", | ||||
|              "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE", | ||||
|              "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI", | ||||
|              "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS", | ||||
|              "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT", | ||||
|              "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR", | ||||
|              "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"] | ||||
|   }, | ||||
|   { "name" : "04778_USTMA", | ||||
|     "RefSeqID" : "XP_011391646", | ||||
|     "UniProtID" : "A0A0D1DQM4", | ||||
|     "taxonomyID" : "237631", | ||||
|     "sequence" : [ | ||||
|              "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK", | ||||
|              "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS", | ||||
|              "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG", | ||||
|              "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG", | ||||
|              "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD", | ||||
|              "PESAQLFTIHDFGSDPFYAEQVERG"] | ||||
|   }, | ||||
|   { "name" : "STUA_ASPNI", | ||||
|     "RefSeqID" : "XP_663440", | ||||
|     "UniProtID" : "P36011", | ||||
|     "taxonomyID" : "227321", | ||||
|     "sequence" : [ | ||||
|              "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS", | ||||
|              "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM", | ||||
|              "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ", | ||||
|              "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL", | ||||
|              "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR", | ||||
|              "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP", | ||||
|              "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD", | ||||
|              "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"] | ||||
|   }, | ||||
|   { "name" : "STUA_NEUCR", | ||||
|     "RefSeqID" : "XP_960837", | ||||
|     "UniProtID" : "Q1K6U0", | ||||
|     "taxonomyID" : "367110", | ||||
|     "sequence" : [ | ||||
|              "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG", | ||||
|              "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT", | ||||
|              "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK", | ||||
|              "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG", | ||||
|              "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ", | ||||
|              "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV", | ||||
|              "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM", | ||||
|              "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH", | ||||
|              "RRR"] | ||||
|   }, | ||||
|   { "name" : "PHD1_SACCE", | ||||
|     "RefSeqID" : "NP_012881", | ||||
|     "UniProtID" : "P36093", | ||||
|     "taxonomyID" : "559292", | ||||
|     "sequence" : [ | ||||
|              "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA", | ||||
|              "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS", | ||||
|              "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS", | ||||
|              "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI", | ||||
|              "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"] | ||||
|   }, | ||||
|   { "name" : "08099_COPCI", | ||||
|     "RefSeqID" : "XP_001836714", | ||||
|     "UniProtID" : "A8NVH3", | ||||
|     "taxonomyID" : "240176", | ||||
|     "sequence" : [ | ||||
|              "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS", | ||||
|              "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS", | ||||
|              "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS", | ||||
|              "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV", | ||||
|              "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ", | ||||
|              "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA", | ||||
|              "PHRPW"] | ||||
|   }, | ||||
|   { "name" : "68479_WALME", | ||||
|     "RefSeqID" : "XP_006957792", | ||||
|     "UniProtID" : "I4YDE0", | ||||
|     "taxonomyID" : "671144", | ||||
|     "sequence" : [ | ||||
|              "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF", | ||||
|              "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS", | ||||
|              "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS", | ||||
|              "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"] | ||||
|   }, | ||||
|   { "name" : "11943_PUCGR", | ||||
|     "RefSeqID" : "XP_003330006", | ||||
|     "UniProtID" : "E3KMR2", | ||||
|     "taxonomyID" : "418459", | ||||
|     "sequence" : [ | ||||
|              "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV", | ||||
|              "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP", | ||||
|              "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA", | ||||
|              "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS", | ||||
|              "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN", | ||||
|              "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA", | ||||
|              "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL", | ||||
|              "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"] | ||||
|   }, | ||||
|   { "name" : "03082_PUCGR", | ||||
|     "RefSeqID" : "XP_003321545", | ||||
|     "UniProtID" : "E3JYK1", | ||||
|     "taxonomyID" : "418459", | ||||
|     "sequence" : [ | ||||
|              "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV", | ||||
|              "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP", | ||||
|              "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL", | ||||
|              "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK", | ||||
|              "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP", | ||||
|              "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA", | ||||
|              "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD", | ||||
|              "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR", | ||||
|              "LIMEWNPSC"] | ||||
|   }, | ||||
|   { "name" : "SOK2_SACCE", | ||||
|     "RefSeqID" : "NP_013729", | ||||
|     "UniProtID" : "P53438", | ||||
|     "taxonomyID" : "559292", | ||||
|     "sequence" : [ | ||||
|              "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS", | ||||
|              "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS", | ||||
|              "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP", | ||||
|              "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY", | ||||
|              "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG", | ||||
|              "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM", | ||||
|              "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS", | ||||
|              "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA", | ||||
|              "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ", | ||||
|              "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"] | ||||
|   }, | ||||
|   { "name" : "14426_COPCI", | ||||
|     "RefSeqID" : "XP_002911429", | ||||
|     "UniProtID" : "D6RMB0", | ||||
|     "taxonomyID" : "240176", | ||||
|     "sequence" : [ | ||||
|              "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA", | ||||
|              "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP", | ||||
|              "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT", | ||||
|              "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK", | ||||
|              "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA", | ||||
|              "ITYLPNFL"] | ||||
|   }, | ||||
|   { "name" : "BQT4_SCHPO", | ||||
|     "RefSeqID" : "NP_596166", | ||||
|     "UniProtID" : "O60158", | ||||
|     "taxonomyID" : "284812", | ||||
|     "sequence" : [ | ||||
|              "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA", | ||||
|              "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG", | ||||
|              "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS", | ||||
|              "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN", | ||||
|              "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK", | ||||
|              "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"] | ||||
|   }, | ||||
|   { "name" : "PGTG_05590", | ||||
|     "RefSeqID" : "XP_003323688", | ||||
|     "UniProtID" : "E3K4V4", | ||||
|     "taxonomyID" : "418459", | ||||
|     "sequence" : [ | ||||
|              "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT", | ||||
|              "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS", | ||||
|              "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS", | ||||
|              "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND", | ||||
|              "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN", | ||||
|              "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG", | ||||
|              "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"] | ||||
|   }, | ||||
|   { "name" : "06560_NEUCR", | ||||
|     "RefSeqID" : "XP_962267", | ||||
|     "UniProtID" : "Q7S9H5", | ||||
|     "taxonomyID" : "367110", | ||||
|     "sequence" : [ | ||||
|              "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP", | ||||
|              "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP", | ||||
|              "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT", | ||||
|              "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV", | ||||
|              "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE", | ||||
|              "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV", | ||||
|              "ANVL"] | ||||
|   }, | ||||
|   { "name" : "81480_BIPOR", | ||||
|     "RefSeqID" : "XP_007682909", | ||||
|     "UniProtID" : "W6ZKJ4", | ||||
|     "taxonomyID" : "930090", | ||||
|     "sequence" : [ | ||||
|              "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN", | ||||
|              "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI", | ||||
|              "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP", | ||||
|              "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE", | ||||
|              "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK", | ||||
|              "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"] | ||||
|   }, | ||||
|   { "name" : "01622_ASPNI", | ||||
|     "RefSeqID" : "XP_657766", | ||||
|     "UniProtID" : "Q5BH18", | ||||
|     "taxonomyID" : "227321", | ||||
|     "sequence" : [ | ||||
|              "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP", | ||||
|              "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA", | ||||
|              "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL", | ||||
|              "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV", | ||||
|              "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE", | ||||
|              "RVRNRALMGVTAAFALAKPALVLLEA"] | ||||
|   }, | ||||
|   { "name" : "05405_ASPNI", | ||||
|     "RefSeqID" : "XP_663009", | ||||
|     "UniProtID" : "Q5B225", | ||||
|     "taxonomyID" : "227321", | ||||
|     "sequence" : [ | ||||
|              "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD", | ||||
|              "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM", | ||||
|              "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA", | ||||
|              "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS", | ||||
|              "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP", | ||||
|              "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR", | ||||
|              "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD", | ||||
|              "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"] | ||||
|   }, | ||||
|   { "name" : "105954_BIPOR", | ||||
|     "RefSeqID" : "XP_007691967", | ||||
|     "UniProtID" : "W6Z1H5", | ||||
|     "taxonomyID" : "930090", | ||||
|     "sequence" : [ | ||||
|              "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI", | ||||
|              "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR", | ||||
|              "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE", | ||||
|              "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP", | ||||
|              "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV", | ||||
|              "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"] | ||||
|   }, | ||||
|   { "name" : "69819_WALME", | ||||
|     "RefSeqID" : "XP_006959479", | ||||
|     "UniProtID" : "I4Y911", | ||||
|     "taxonomyID" : "671144", | ||||
|     "sequence" : [ | ||||
|              "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI", | ||||
|              "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS", | ||||
|              "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL", | ||||
|              "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ", | ||||
|              "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"] | ||||
|   }, | ||||
|   { "name" : "02840_CRYNE", | ||||
|     "RefSeqID" : "XP_568872", | ||||
|     "UniProtID" : "Q5KM59", | ||||
|     "taxonomyID" : "214684", | ||||
|     "sequence" : [ | ||||
|              "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG", | ||||
|              "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV", | ||||
|              "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT", | ||||
|              "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA", | ||||
|              "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"] | ||||
|   }, | ||||
|   { "name" : "11055_USTMA", | ||||
|     "RefSeqID" : "XP_011390537", | ||||
|     "UniProtID" : "A0A0D1DZM8", | ||||
|     "taxonomyID" : "237631", | ||||
|     "sequence" : [ | ||||
|              "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK", | ||||
|              "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF", | ||||
|              "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV", | ||||
|              "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR", | ||||
|              "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV", | ||||
|              "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL", | ||||
|              "ASILPW"] | ||||
|   }, | ||||
|   { "name" : "XBP1_NEUCR", | ||||
|     "RefSeqID" : "XP_962373", | ||||
|     "UniProtID" : "Q7S9W7", | ||||
|     "taxonomyID" : "367110", | ||||
|     "sequence" : [ | ||||
|              "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT", | ||||
|              "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR", | ||||
|              "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH", | ||||
|              "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP", | ||||
|              "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ", | ||||
|              "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT", | ||||
|              "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT", | ||||
|              "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD", | ||||
|              "DEGDYEHEQQYRRKRRRLLLVGRAKSF"] | ||||
|   }, | ||||
|   { "name" : "XBP1_SACCE", | ||||
|     "RefSeqID" : "NP_012165", | ||||
|     "UniProtID" : "P40489", | ||||
|     "taxonomyID" : "559292", | ||||
|     "sequence" : [ | ||||
|              "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA", | ||||
|              "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR", | ||||
|              "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE", | ||||
|              "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS", | ||||
|              "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ", | ||||
|              "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND", | ||||
|              "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY", | ||||
|              "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK", | ||||
|              "FKTNSKQ"] | ||||
|   } | ||||
| ] | ||||
|   | ||||
| @@ -1,116 +1,116 @@ | ||||
| [ | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"}, | ||||
|  | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"}, | ||||
|  | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"}, | ||||
|  | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"}, | ||||
|  | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"}, | ||||
|  | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"}, | ||||
|  | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"}, | ||||
|  | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"}, | ||||
|  | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"}, | ||||
|  | ||||
|   {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"} | ||||
| ] | ||||
| [ | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"}, | ||||
|   {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"}, | ||||
|  | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"}, | ||||
|   {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"}, | ||||
|  | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"}, | ||||
|   {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"}, | ||||
|  | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"}, | ||||
|   {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"}, | ||||
|  | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"}, | ||||
|   {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"}, | ||||
|  | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"}, | ||||
|   {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"}, | ||||
|  | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"}, | ||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"}, | ||||
|  | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"}, | ||||
|   {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"}, | ||||
|  | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"}, | ||||
|   {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"}, | ||||
|  | ||||
|   {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"}, | ||||
|   {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"} | ||||
| ] | ||||
|   | ||||
| @@ -1,47 +1,47 @@ | ||||
| [ | ||||
|   { "name" : "APSES fold", | ||||
|     "description " : "DNA binding domain by similarity to structure", | ||||
|     "sourceDB" : "PDB", | ||||
|     "accession" : "1BM8_A_1_99"}, | ||||
|  | ||||
|   { "name" : "KilA-N", | ||||
|     "description " : "DNA binding domain by Pfam annotation", | ||||
|     "sourceDB" : "Pfam", | ||||
|     "accession" : "PF04383"}, | ||||
|  | ||||
|   { "name" : "AT hook", | ||||
|     "description " : "DNA interaction motif by SMART annotation", | ||||
|     "sourceDB" : "SMART", | ||||
|     "accession" : null}, | ||||
|  | ||||
|   { "name" : "low complexity", | ||||
|     "description " : "SEG annotation by SMART", | ||||
|     "sourceDB" : "SMART", | ||||
|     "accession" : null}, | ||||
|  | ||||
|   { "name" : "Ankyrin fold", | ||||
|     "description " : "Ankyrin domain by SMART annotation", | ||||
|     "sourceDB" : "SMART", | ||||
|     "accession" : "SM00248"}, | ||||
|  | ||||
|   { "name" : "Swi6 fold", | ||||
|     "description " : "Swi6 fold by similarity to structure", | ||||
|     "sourceDB" : "PDB", | ||||
|     "accession" : "1SW6_B"}, | ||||
|  | ||||
|   { "name" : "coiled coil", | ||||
|     "description " : "Coiled coil by SMART annotation", | ||||
|     "sourceDB" : "SMART", | ||||
|     "accession" : null}, | ||||
|  | ||||
|   { "name" : "McInerny 2011", | ||||
|     "description " : "Yeast cell cycle review", | ||||
|     "sourceDB" : "PubMed", | ||||
|     "accession" : "21310294"} | ||||
| ] | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| [ | ||||
|   { "name" : "APSES fold", | ||||
|     "description " : "DNA binding domain by similarity to structure", | ||||
|     "sourceDB" : "PDB", | ||||
|     "accession" : "1BM8_A_1_99"}, | ||||
|  | ||||
|   { "name" : "KilA-N", | ||||
|     "description " : "DNA binding domain by Pfam annotation", | ||||
|     "sourceDB" : "Pfam", | ||||
|     "accession" : "PF04383"}, | ||||
|  | ||||
|   { "name" : "AT hook", | ||||
|     "description " : "DNA interaction motif by SMART annotation", | ||||
|     "sourceDB" : "SMART", | ||||
|     "accession" : null}, | ||||
|  | ||||
|   { "name" : "low complexity", | ||||
|     "description " : "SEG annotation by SMART", | ||||
|     "sourceDB" : "SMART", | ||||
|     "accession" : null}, | ||||
|  | ||||
|   { "name" : "Ankyrin fold", | ||||
|     "description " : "Ankyrin domain by SMART annotation", | ||||
|     "sourceDB" : "SMART", | ||||
|     "accession" : "SM00248"}, | ||||
|  | ||||
|   { "name" : "Swi6 fold", | ||||
|     "description " : "Swi6 fold by similarity to structure", | ||||
|     "sourceDB" : "PDB", | ||||
|     "accession" : "1SW6_B"}, | ||||
|  | ||||
|   { "name" : "coiled coil", | ||||
|     "description " : "Coiled coil by SMART annotation", | ||||
|     "sourceDB" : "SMART", | ||||
|     "accession" : null}, | ||||
|  | ||||
|   { "name" : "McInerny 2011", | ||||
|     "description " : "Yeast cell cycle review", | ||||
|     "sourceDB" : "PubMed", | ||||
|     "accession" : "21310294"} | ||||
| ] | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -1,155 +1,155 @@ | ||||
| [ | ||||
|   { "name" : "MBP1_SCHPO", | ||||
|     "RefSeqID" : "NP_593032", | ||||
|     "UniProtID" : "P41412", | ||||
|     "taxonomyID" : 284812, | ||||
|     "sequence" : [ | ||||
|        "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ", | ||||
|        "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS", | ||||
|        "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK", | ||||
|        "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL", | ||||
|        "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI", | ||||
|        "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL", | ||||
|        "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL", | ||||
|        "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS", | ||||
|        "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI", | ||||
|        "AMSCGINPEDLSLEILDAVEEALTREK"] | ||||
|   }, | ||||
|   { "name" : "MBP1_ASPNI", | ||||
|     "RefSeqID" : "XP_660758", | ||||
|     "UniProtID" : "Q5B8H6", | ||||
|     "taxonomyID" : 227321, | ||||
|     "sequence" : [ | ||||
|        "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV", | ||||
|        "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV", | ||||
|        "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA", | ||||
|        "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL", | ||||
|        "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS", | ||||
|        "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS", | ||||
|        "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL", | ||||
|        "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE", | ||||
|        "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF", | ||||
|        "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"] | ||||
|   }, | ||||
|   { "name" : "MBP1_BIPOR", | ||||
|     "RefSeqID" : "XP_007682304", | ||||
|     "UniProtID" : "W6ZM86", | ||||
|     "taxonomyID" : 930090, | ||||
|     "sequence" : [ | ||||
|        "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV", | ||||
|        "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA", | ||||
|        "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG", | ||||
|        "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV", | ||||
|        "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS", | ||||
|        "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK", | ||||
|        "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA", | ||||
|        "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND", | ||||
|        "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV", | ||||
|        "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"] | ||||
|   }, | ||||
|   { "name" : "MBP1_NEUCR", | ||||
|     "RefSeqID" : "XP_955821", | ||||
|     "UniProtID" : "Q7RW59", | ||||
|     "taxonomyID" : 367110, | ||||
|     "sequence" : [ | ||||
|        "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV", | ||||
|        "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV", | ||||
|        "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS", | ||||
|        "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG", | ||||
|        "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK", | ||||
|        "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP", | ||||
|        "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV", | ||||
|        "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE", | ||||
|        "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE", | ||||
|        "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA", | ||||
|        "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG", | ||||
|        "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"] | ||||
|   }, | ||||
|   { "name" : "MBP1_COPCI", | ||||
|     "RefSeqID" : "XP_001837394", | ||||
|     "UniProtID" : "A8NYC6", | ||||
|     "taxonomyID" : 240176, | ||||
|     "sequence" : [ | ||||
|        "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG", | ||||
|        "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN", | ||||
|        "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY", | ||||
|        "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA", | ||||
|        "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD", | ||||
|        "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN", | ||||
|        "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST", | ||||
|        "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE", | ||||
|        "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ", | ||||
|        "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA", | ||||
|        "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"] | ||||
|   }, | ||||
|   { "name" : "MBP1_CRYNE", | ||||
|     "RefSeqID" : "XP_569090", | ||||
|     "UniProtID" : "Q5KMQ9", | ||||
|     "taxonomyID" : 214684, | ||||
|     "sequence" : [ | ||||
|        "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV", | ||||
|        "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE", | ||||
|        "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM", | ||||
|        "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG", | ||||
|        "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT", | ||||
|        "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK", | ||||
|        "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE", | ||||
|        "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE", | ||||
|        "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR", | ||||
|        "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ", | ||||
|        "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"] | ||||
|   }, | ||||
|   { "name" : "MBP1_PUCGR", | ||||
|     "RefSeqID" : "XP_003327086", | ||||
|     "UniProtID" : "E3KED4", | ||||
|     "taxonomyID" : 418459, | ||||
|     "sequence" : [ | ||||
|        "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY", | ||||
|        "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE", | ||||
|        "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK", | ||||
|        "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN", | ||||
|        "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS", | ||||
|        "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS", | ||||
|        "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH", | ||||
|        "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE", | ||||
|        "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG", | ||||
|        "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD", | ||||
|        "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR", | ||||
|        "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE", | ||||
|        "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL", | ||||
|        "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"] | ||||
|   }, | ||||
|   { "name" : "MBP1_USTMA", | ||||
|     "RefSeqID" : "XP_011392621", | ||||
|     "UniProtID" : "A0A0D1DP35", | ||||
|     "taxonomyID" : 237631, | ||||
|     "sequence" : [ | ||||
|        "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG", | ||||
|        "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS", | ||||
|        "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY", | ||||
|        "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ", | ||||
|        "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY", | ||||
|        "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP", | ||||
|        "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE", | ||||
|        "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS", | ||||
|        "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM", | ||||
|        "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA", | ||||
|        "P"] | ||||
|   }, | ||||
|   { "name" : "MBP1_WALME", | ||||
|     "RefSeqID" : "XP_006957051", | ||||
|     "UniProtID" : "I4YGC0", | ||||
|     "taxonomyID" : 671144, | ||||
|     "sequence" : [ | ||||
|        "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG", | ||||
|        "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK", | ||||
|        "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID", | ||||
|        "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV", | ||||
|        "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS", | ||||
|        "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP", | ||||
|        "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR", | ||||
|        "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE", | ||||
|        "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA", | ||||
|        "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"] | ||||
|   } | ||||
| ] | ||||
| [ | ||||
|   { "name" : "MBP1_SCHPO", | ||||
|     "RefSeqID" : "NP_593032", | ||||
|     "UniProtID" : "P41412", | ||||
|     "taxonomyID" : 284812, | ||||
|     "sequence" : [ | ||||
|        "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ", | ||||
|        "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS", | ||||
|        "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK", | ||||
|        "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL", | ||||
|        "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI", | ||||
|        "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL", | ||||
|        "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL", | ||||
|        "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS", | ||||
|        "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI", | ||||
|        "AMSCGINPEDLSLEILDAVEEALTREK"] | ||||
|   }, | ||||
|   { "name" : "MBP1_ASPNI", | ||||
|     "RefSeqID" : "XP_660758", | ||||
|     "UniProtID" : "Q5B8H6", | ||||
|     "taxonomyID" : 227321, | ||||
|     "sequence" : [ | ||||
|        "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV", | ||||
|        "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV", | ||||
|        "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA", | ||||
|        "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL", | ||||
|        "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS", | ||||
|        "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS", | ||||
|        "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL", | ||||
|        "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE", | ||||
|        "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF", | ||||
|        "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"] | ||||
|   }, | ||||
|   { "name" : "MBP1_BIPOR", | ||||
|     "RefSeqID" : "XP_007682304", | ||||
|     "UniProtID" : "W6ZM86", | ||||
|     "taxonomyID" : 930090, | ||||
|     "sequence" : [ | ||||
|        "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV", | ||||
|        "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA", | ||||
|        "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG", | ||||
|        "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV", | ||||
|        "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS", | ||||
|        "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK", | ||||
|        "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA", | ||||
|        "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND", | ||||
|        "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV", | ||||
|        "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"] | ||||
|   }, | ||||
|   { "name" : "MBP1_NEUCR", | ||||
|     "RefSeqID" : "XP_955821", | ||||
|     "UniProtID" : "Q7RW59", | ||||
|     "taxonomyID" : 367110, | ||||
|     "sequence" : [ | ||||
|        "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV", | ||||
|        "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV", | ||||
|        "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS", | ||||
|        "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG", | ||||
|        "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK", | ||||
|        "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP", | ||||
|        "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV", | ||||
|        "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE", | ||||
|        "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE", | ||||
|        "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA", | ||||
|        "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG", | ||||
|        "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"] | ||||
|   }, | ||||
|   { "name" : "MBP1_COPCI", | ||||
|     "RefSeqID" : "XP_001837394", | ||||
|     "UniProtID" : "A8NYC6", | ||||
|     "taxonomyID" : 240176, | ||||
|     "sequence" : [ | ||||
|        "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG", | ||||
|        "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN", | ||||
|        "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY", | ||||
|        "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA", | ||||
|        "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD", | ||||
|        "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN", | ||||
|        "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST", | ||||
|        "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE", | ||||
|        "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ", | ||||
|        "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA", | ||||
|        "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"] | ||||
|   }, | ||||
|   { "name" : "MBP1_CRYNE", | ||||
|     "RefSeqID" : "XP_569090", | ||||
|     "UniProtID" : "Q5KMQ9", | ||||
|     "taxonomyID" : 214684, | ||||
|     "sequence" : [ | ||||
|        "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV", | ||||
|        "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE", | ||||
|        "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM", | ||||
|        "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG", | ||||
|        "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT", | ||||
|        "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK", | ||||
|        "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE", | ||||
|        "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE", | ||||
|        "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR", | ||||
|        "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ", | ||||
|        "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"] | ||||
|   }, | ||||
|   { "name" : "MBP1_PUCGR", | ||||
|     "RefSeqID" : "XP_003327086", | ||||
|     "UniProtID" : "E3KED4", | ||||
|     "taxonomyID" : 418459, | ||||
|     "sequence" : [ | ||||
|        "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY", | ||||
|        "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE", | ||||
|        "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK", | ||||
|        "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN", | ||||
|        "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS", | ||||
|        "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS", | ||||
|        "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH", | ||||
|        "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE", | ||||
|        "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG", | ||||
|        "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD", | ||||
|        "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR", | ||||
|        "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE", | ||||
|        "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL", | ||||
|        "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"] | ||||
|   }, | ||||
|   { "name" : "MBP1_USTMA", | ||||
|     "RefSeqID" : "XP_011392621", | ||||
|     "UniProtID" : "A0A0D1DP35", | ||||
|     "taxonomyID" : 237631, | ||||
|     "sequence" : [ | ||||
|        "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG", | ||||
|        "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS", | ||||
|        "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY", | ||||
|        "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ", | ||||
|        "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY", | ||||
|        "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP", | ||||
|        "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE", | ||||
|        "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS", | ||||
|        "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM", | ||||
|        "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA", | ||||
|        "P"] | ||||
|   }, | ||||
|   { "name" : "MBP1_WALME", | ||||
|     "RefSeqID" : "XP_006957051", | ||||
|     "UniProtID" : "I4YGC0", | ||||
|     "taxonomyID" : 671144, | ||||
|     "sequence" : [ | ||||
|        "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG", | ||||
|        "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK", | ||||
|        "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID", | ||||
|        "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV", | ||||
|        "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS", | ||||
|        "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP", | ||||
|        "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR", | ||||
|        "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE", | ||||
|        "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA", | ||||
|        "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"] | ||||
|   } | ||||
| ] | ||||
|   | ||||
| @@ -1,22 +1,22 @@ | ||||
| [ | ||||
|   { "ID" : 227321, | ||||
|     "species" : "Aspergillus nidulans FGSC A4"}, | ||||
|   { "ID" : 930090, | ||||
|     "species" : "Bipolaris oryzae ATCC 44560"}, | ||||
|   { "ID" : 240176, | ||||
|     "species" : "Coprinopsis cinerea okayama7#130"}, | ||||
|   { "ID" : 214684, | ||||
|     "species" : "Cryptococcus neoformans var. neoformans JEC21"}, | ||||
|   { "ID" : 367110, | ||||
|     "species" : "Neurospora crassa OR74A"}, | ||||
|   { "ID" : 418459, | ||||
|     "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"}, | ||||
|   { "ID" : 559292, | ||||
|     "species" : "Saccharomyces cerevisiae S288C"}, | ||||
|   { "ID" : 284812, | ||||
|     "species" : "Schizosaccharomyces pombe 972h-"}, | ||||
|   { "ID" : 237631, | ||||
|     "species" : "Ustilago maydis 521"}, | ||||
|   { "ID" : 671144, | ||||
|     "species" : "Wallemia mellicola CBS 633.66"} | ||||
| ] | ||||
| [ | ||||
|   { "ID" : 227321, | ||||
|     "species" : "Aspergillus nidulans FGSC A4"}, | ||||
|   { "ID" : 930090, | ||||
|     "species" : "Bipolaris oryzae ATCC 44560"}, | ||||
|   { "ID" : 240176, | ||||
|     "species" : "Coprinopsis cinerea okayama7#130"}, | ||||
|   { "ID" : 214684, | ||||
|     "species" : "Cryptococcus neoformans var. neoformans JEC21"}, | ||||
|   { "ID" : 367110, | ||||
|     "species" : "Neurospora crassa OR74A"}, | ||||
|   { "ID" : 418459, | ||||
|     "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"}, | ||||
|   { "ID" : 559292, | ||||
|     "species" : "Saccharomyces cerevisiae S288C"}, | ||||
|   { "ID" : 284812, | ||||
|     "species" : "Schizosaccharomyces pombe 972h-"}, | ||||
|   { "ID" : 237631, | ||||
|     "species" : "Ustilago maydis 521"}, | ||||
|   { "ID" : 671144, | ||||
|     "species" : "Wallemia mellicola CBS 633.66"} | ||||
| ] | ||||
|   | ||||
| @@ -1,115 +1,115 @@ | ||||
| ID	protein.ID	feature.ID	start	end	note | ||||
| # MBP1_SACCE | ||||
| NA	ref_pro_4	ref_ftr_1	4	102	APSES fold | ||||
| NA	ref_pro_4	ref_ftr_2	22	105	KilA-N | ||||
| NA	ref_pro_4	ref_ftr_4	108	122	low complexity | ||||
| NA	ref_pro_4	ref_ftr_4	236	241	low complexity | ||||
| NA	ref_pro_4	ref_ftr_4	279	307	low complexity | ||||
| NA	ref_pro_4	ref_ftr_4	700	717	low complexity | ||||
| NA	ref_pro_4	ref_ftr_4	700	717	low complexity | ||||
| NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin | ||||
| NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin | ||||
| NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin | ||||
| NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold | ||||
| NA	ref_pro_4	ref_ftr_7	633	655	coiled coil | ||||
| # MBP1_ASPNI | ||||
| NA	ref_pro_1	ref_ftr_1	9	106	APSES fold | ||||
| NA	ref_pro_1	ref_ftr_2	26	109	KilA-N | ||||
| NA	ref_pro_1	ref_ftr_4	529	534	low complexity | ||||
| NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin | ||||
| NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin | ||||
| NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold | ||||
| NA	ref_pro_1	ref_ftr_7	509	572	coiled coil | ||||
| # MBP1_BIPOR | ||||
| NA	ref_pro_2	ref_ftr_1	8	106	APSES fold | ||||
| NA	ref_pro_2	ref_ftr_2	26	109	KilA-N | ||||
| NA	ref_pro_2	ref_ftr_4	134	152	low complexity | ||||
| NA	ref_pro_2	ref_ftr_4	267	278	low complexity | ||||
| NA	ref_pro_2	ref_ftr_4	670	685	low complexity | ||||
| NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin | ||||
| NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin | ||||
| NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold | ||||
| NA	ref_pro_2	ref_ftr_7	659	681	coiled coil | ||||
| NA	ref_pro_2	ref_ftr_7	500	590	coiled coil | ||||
| # MBP1_NEUCR | ||||
| NA	ref_pro_3	ref_ftr_1	14	114	APSES fold | ||||
| NA	ref_pro_3	ref_ftr_2	34	117	KilA-N | ||||
| NA	ref_pro_3	ref_ftr_4	130	141	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	253	266	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	514	525	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	554	564	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	601	618	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	620	629	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	636	652	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	658	672	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	725	735	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	752	771	low complexity | ||||
| NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin | ||||
| NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin | ||||
| NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold | ||||
| NA	ref_pro_3	ref_ftr_7	500	550	coiled coil | ||||
| # MBP1_SCHPO | ||||
| NA	ref_pro_5	ref_ftr_1	8	104	APSES fold | ||||
| NA	ref_pro_5	ref_ftr_2	25	113	KilA-N | ||||
| NA	ref_pro_5	ref_ftr_4	111	125	low complexity | ||||
| NA	ref_pro_5	ref_ftr_4	136	145	low complexity | ||||
| NA	ref_pro_5	ref_ftr_4	176	191	low complexity | ||||
| NA	ref_pro_5	ref_ftr_4	422	447	low complexity | ||||
| NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin | ||||
| NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin | ||||
| NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold | ||||
| NA	ref_pro_5	ref_ftr_7	457	538	coiled coil | ||||
| # MBP1_COPCI | ||||
| NA	ref_pro_6	ref_ftr_1	5	103	APSES fold | ||||
| NA	ref_pro_6	ref_ftr_2	23	106	KilA-N | ||||
| NA	ref_pro_6	ref_ftr_4	170	191	low complexity | ||||
| NA	ref_pro_6	ref_ftr_4	435	450	low complexity | ||||
| NA	ref_pro_6	ref_ftr_4	611	626	low complexity | ||||
| NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin | ||||
| NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin | ||||
| NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin | ||||
| NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold | ||||
| NA	ref_pro_6	ref_ftr_7	500	570	coiled coil | ||||
| NA	ref_pro_6	ref_ftr_7	651	678	coiled coil | ||||
| # MBP1_CRYNE | ||||
| NA	ref_pro_7	ref_ftr_1	113	211	APSES fold | ||||
| NA	ref_pro_7	ref_ftr_2	131	215	KilA-N | ||||
| NA	ref_pro_7	ref_ftr_4	66	85	low complexity | ||||
| NA	ref_pro_7	ref_ftr_4	413	423	low complexity | ||||
| NA	ref_pro_7	ref_ftr_4	633	644	low complexity | ||||
| NA	ref_pro_7	ref_ftr_4	697	709	low complexity | ||||
| NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin | ||||
| NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin | ||||
| NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold | ||||
| # MBP1_PUCGR | ||||
| NA	ref_pro_8	ref_ftr_1	90	187	APSES fold | ||||
| NA	ref_pro_8	ref_ftr_2	107	190	KilA-N | ||||
| NA	ref_pro_8	ref_ftr_4	208	227	low complexity | ||||
| NA	ref_pro_8	ref_ftr_4	273	291	low complexity | ||||
| NA	ref_pro_8	ref_ftr_5	442	271	Ankyrin | ||||
| NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin | ||||
| NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin | ||||
| NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold | ||||
| NA	ref_pro_8	ref_ftr_7	827	863	coiled coil | ||||
| # MBP1_USTMA | ||||
| NA	ref_pro_9	ref_ftr_1	7	104	APSES fold | ||||
| NA	ref_pro_9	ref_ftr_2	24	107	KilA-N | ||||
| NA	ref_pro_9	ref_ftr_4	106	116	low complexity | ||||
| NA	ref_pro_9	ref_ftr_4	161	183	low complexity | ||||
| NA	ref_pro_9	ref_ftr_4	657	672	low complexity | ||||
| NA	ref_pro_9	ref_ftr_4	776	796	low complexity | ||||
| NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin | ||||
| NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin | ||||
| NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold | ||||
| NA	ref_pro_9	ref_ftr_7	581	609	coiled coil | ||||
| # MBP1_WALME | ||||
| NA	ref_pro_10	ref_ftr_1	6	103	APSES fold | ||||
| NA	ref_pro_10	ref_ftr_2	23	106	KilA-N | ||||
| NA	ref_pro_10	ref_ftr_4	149	162	low complexity | ||||
| NA	ref_pro_10	ref_ftr_4	171	188	low complexity | ||||
| NA	ref_pro_10	ref_ftr_4	618	628	low complexity | ||||
| NA	ref_pro_10	ref_ftr_4	634	660	low complexity | ||||
| NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin | ||||
| NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin | ||||
| NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold | ||||
| NA	ref_pro_10	ref_ftr_7	461	585	coiled coil | ||||
| ID	protein.ID	feature.ID	start	end	note | ||||
| # MBP1_SACCE | ||||
| NA	ref_pro_4	ref_ftr_1	4	102	APSES fold | ||||
| NA	ref_pro_4	ref_ftr_2	22	105	KilA-N | ||||
| NA	ref_pro_4	ref_ftr_4	108	122	low complexity | ||||
| NA	ref_pro_4	ref_ftr_4	236	241	low complexity | ||||
| NA	ref_pro_4	ref_ftr_4	279	307	low complexity | ||||
| NA	ref_pro_4	ref_ftr_4	700	717	low complexity | ||||
| NA	ref_pro_4	ref_ftr_4	700	717	low complexity | ||||
| NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin | ||||
| NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin | ||||
| NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin | ||||
| NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold | ||||
| NA	ref_pro_4	ref_ftr_7	633	655	coiled coil | ||||
| # MBP1_ASPNI | ||||
| NA	ref_pro_1	ref_ftr_1	9	106	APSES fold | ||||
| NA	ref_pro_1	ref_ftr_2	26	109	KilA-N | ||||
| NA	ref_pro_1	ref_ftr_4	529	534	low complexity | ||||
| NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin | ||||
| NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin | ||||
| NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold | ||||
| NA	ref_pro_1	ref_ftr_7	509	572	coiled coil | ||||
| # MBP1_BIPOR | ||||
| NA	ref_pro_2	ref_ftr_1	8	106	APSES fold | ||||
| NA	ref_pro_2	ref_ftr_2	26	109	KilA-N | ||||
| NA	ref_pro_2	ref_ftr_4	134	152	low complexity | ||||
| NA	ref_pro_2	ref_ftr_4	267	278	low complexity | ||||
| NA	ref_pro_2	ref_ftr_4	670	685	low complexity | ||||
| NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin | ||||
| NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin | ||||
| NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold | ||||
| NA	ref_pro_2	ref_ftr_7	659	681	coiled coil | ||||
| NA	ref_pro_2	ref_ftr_7	500	590	coiled coil | ||||
| # MBP1_NEUCR | ||||
| NA	ref_pro_3	ref_ftr_1	14	114	APSES fold | ||||
| NA	ref_pro_3	ref_ftr_2	34	117	KilA-N | ||||
| NA	ref_pro_3	ref_ftr_4	130	141	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	253	266	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	514	525	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	554	564	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	601	618	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	620	629	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	636	652	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	658	672	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	725	735	low complexity | ||||
| NA	ref_pro_3	ref_ftr_4	752	771	low complexity | ||||
| NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin | ||||
| NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin | ||||
| NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold | ||||
| NA	ref_pro_3	ref_ftr_7	500	550	coiled coil | ||||
| # MBP1_SCHPO | ||||
| NA	ref_pro_5	ref_ftr_1	8	104	APSES fold | ||||
| NA	ref_pro_5	ref_ftr_2	25	113	KilA-N | ||||
| NA	ref_pro_5	ref_ftr_4	111	125	low complexity | ||||
| NA	ref_pro_5	ref_ftr_4	136	145	low complexity | ||||
| NA	ref_pro_5	ref_ftr_4	176	191	low complexity | ||||
| NA	ref_pro_5	ref_ftr_4	422	447	low complexity | ||||
| NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin | ||||
| NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin | ||||
| NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold | ||||
| NA	ref_pro_5	ref_ftr_7	457	538	coiled coil | ||||
| # MBP1_COPCI | ||||
| NA	ref_pro_6	ref_ftr_1	5	103	APSES fold | ||||
| NA	ref_pro_6	ref_ftr_2	23	106	KilA-N | ||||
| NA	ref_pro_6	ref_ftr_4	170	191	low complexity | ||||
| NA	ref_pro_6	ref_ftr_4	435	450	low complexity | ||||
| NA	ref_pro_6	ref_ftr_4	611	626	low complexity | ||||
| NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin | ||||
| NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin | ||||
| NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin | ||||
| NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold | ||||
| NA	ref_pro_6	ref_ftr_7	500	570	coiled coil | ||||
| NA	ref_pro_6	ref_ftr_7	651	678	coiled coil | ||||
| # MBP1_CRYNE | ||||
| NA	ref_pro_7	ref_ftr_1	113	211	APSES fold | ||||
| NA	ref_pro_7	ref_ftr_2	131	215	KilA-N | ||||
| NA	ref_pro_7	ref_ftr_4	66	85	low complexity | ||||
| NA	ref_pro_7	ref_ftr_4	413	423	low complexity | ||||
| NA	ref_pro_7	ref_ftr_4	633	644	low complexity | ||||
| NA	ref_pro_7	ref_ftr_4	697	709	low complexity | ||||
| NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin | ||||
| NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin | ||||
| NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold | ||||
| # MBP1_PUCGR | ||||
| NA	ref_pro_8	ref_ftr_1	90	187	APSES fold | ||||
| NA	ref_pro_8	ref_ftr_2	107	190	KilA-N | ||||
| NA	ref_pro_8	ref_ftr_4	208	227	low complexity | ||||
| NA	ref_pro_8	ref_ftr_4	273	291	low complexity | ||||
| NA	ref_pro_8	ref_ftr_5	442	271	Ankyrin | ||||
| NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin | ||||
| NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin | ||||
| NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold | ||||
| NA	ref_pro_8	ref_ftr_7	827	863	coiled coil | ||||
| # MBP1_USTMA | ||||
| NA	ref_pro_9	ref_ftr_1	7	104	APSES fold | ||||
| NA	ref_pro_9	ref_ftr_2	24	107	KilA-N | ||||
| NA	ref_pro_9	ref_ftr_4	106	116	low complexity | ||||
| NA	ref_pro_9	ref_ftr_4	161	183	low complexity | ||||
| NA	ref_pro_9	ref_ftr_4	657	672	low complexity | ||||
| NA	ref_pro_9	ref_ftr_4	776	796	low complexity | ||||
| NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin | ||||
| NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin | ||||
| NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold | ||||
| NA	ref_pro_9	ref_ftr_7	581	609	coiled coil | ||||
| # MBP1_WALME | ||||
| NA	ref_pro_10	ref_ftr_1	6	103	APSES fold | ||||
| NA	ref_pro_10	ref_ftr_2	23	106	KilA-N | ||||
| NA	ref_pro_10	ref_ftr_4	149	162	low complexity | ||||
| NA	ref_pro_10	ref_ftr_4	171	188	low complexity | ||||
| NA	ref_pro_10	ref_ftr_4	618	628	low complexity | ||||
| NA	ref_pro_10	ref_ftr_4	634	660	low complexity | ||||
| NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin | ||||
| NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin | ||||
| NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold | ||||
| NA	ref_pro_10	ref_ftr_7	461	585	coiled coil | ||||
|   | ||||
| @@ -1,37 +1,37 @@ | ||||
| # functionTemplate.R | ||||
| # | ||||
| # Purpose:  (General) | ||||
| # | ||||
| # ToDo: | ||||
| # Notes: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| myFunction <- function(a, b=1) { | ||||
|   # Purpose: | ||||
|   #     Describe ... | ||||
|   # Version: | ||||
|   # Date: | ||||
|   # Author: | ||||
|   # | ||||
|   # Parameters: | ||||
|   #     a: ... | ||||
|   #     b: ... (optional, defaults to 1) | ||||
|   # Value: | ||||
|   #     result: ... | ||||
|   # Example: <example invocation> | ||||
|  | ||||
|   # code ... (template placeholder: must create `result`) | ||||
|  | ||||
|   # The value of the last expression is what the function returns. | ||||
|   result | ||||
| } | ||||
|  | ||||
|  | ||||
| # ====  TESTS  ================================================================= | ||||
| # Enter your function tests here... | ||||
|  | ||||
| if (FALSE) { | ||||
|   # test ... | ||||
| } | ||||
|  | ||||
| # [END] | ||||
| # functionTemplate.R | ||||
| # | ||||
| # Purpose:  (General) | ||||
| # | ||||
| # ToDo: | ||||
| # Notes: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| myFunction <- function(a, b=1) { | ||||
|   # Purpose: | ||||
|   #     Describe ... | ||||
|   # Version: | ||||
|   # Date: | ||||
|   # Author: | ||||
|   # | ||||
|   # Parameters: | ||||
|   #     a: ... | ||||
|   #     b: ... (optional, defaults to 1) | ||||
|   # Value: | ||||
|   #     result: ... | ||||
|   # Example: <example invocation> | ||||
|  | ||||
|   # code ... (template placeholder: must create `result`) | ||||
|  | ||||
|   # The value of the last expression is what the function returns. | ||||
|   result | ||||
| } | ||||
|  | ||||
|  | ||||
| # ====  TESTS  ================================================================= | ||||
| # Enter your function tests here... | ||||
|  | ||||
| if (FALSE) { | ||||
|   # test ... | ||||
| } | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,21 +1,21 @@ | ||||
| # .myProfile.R | ||||
| # This contains information which the course framework needs from time to time | ||||
| # to personalize assignments, validate submissions etc. Make sure that | ||||
| # the information correctly matches our official records. | ||||
| # myEMail          char      A string with your eMail address. Use your official | ||||
| #                            UofT eMail address. | ||||
| # myStudentNumber  numeric   Your UofT student number. Take care to have this | ||||
| #                            correct. | ||||
| # MYSPE            char      A binomial species name. NOTE(review): presumably | ||||
| #                            the species assigned to you for the course - | ||||
| #                            confirm against the course framework. | ||||
| # | ||||
| # NOTE: | ||||
| # After you have updated this script, move the file to your "myScripts" folder. | ||||
| # Utility scripts will look for it on the path: "./myScripts/.myProfile.R" | ||||
| # | ||||
| # ============================================================================== | ||||
| # options(stringsAsFactors = FALSE) | ||||
|  | ||||
| myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca" | ||||
| myStudentNumber <- 1005845285  # e.g. 1003141592 | ||||
| MYSPE <- "Cutaneotrichosporon oleaginosum"  | ||||
|  | ||||
| # [END] | ||||
| # .myProfile.R | ||||
| # This contains information which the course framework needs from time to time | ||||
| # to personalize assignments, validate submissions etc. Make sure that | ||||
| # the information correctly matches our official records. | ||||
| # myEMail          char      A string with your eMail address. Use your official | ||||
| #                            UofT eMail address. | ||||
| # myStudentNumber  numeric   Your UofT student number. Take care to have this | ||||
| #                            correct. | ||||
| # MYSPE            char      A binomial species name. NOTE(review): presumably | ||||
| #                            the species assigned to you for the course - | ||||
| #                            confirm against the course framework. | ||||
| # | ||||
| # NOTE: | ||||
| # After you have updated this script, move the file to your "myScripts" folder. | ||||
| # Utility scripts will look for it on the path: "./myScripts/.myProfile.R" | ||||
| # | ||||
| # ============================================================================== | ||||
| # options(stringsAsFactors = FALSE) | ||||
|  | ||||
| myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca" | ||||
| myStudentNumber <- 1005845285  # e.g. 1003141592 | ||||
| MYSPE <- "Cutaneotrichosporon oleaginosum"  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,54 +1,51 @@ | ||||
| # Read the coding sequences of the four genes and stack the results of | ||||
| # readFASTA() row-wise, in the order the files are listed. | ||||
| myFA <- do.call( | ||||
|   rbind, | ||||
|   lapply( | ||||
|     c("data/RAB39B_HSa_coding.fa", | ||||
|       "data/PTPN5_HSa_coding.fa", | ||||
|       "data/PTPN11_HSa_coding.fa", | ||||
|       "data/KRAS_HSa_coding.fa"), | ||||
|     readFASTA | ||||
|   ) | ||||
| ) | ||||
| # Label each row with its gene symbol (same order as the files above). | ||||
| rownames(myFA) <- c("RAB39B", "PTPN5", "PTPN11", "KRAS") | ||||
|  | ||||
| gen_mutations <- function(seq, N) { | ||||
|   # Purpose: | ||||
|   #     Apply N independent random single-nucleotide mutations (substitution, | ||||
|   #     insertion or deletion, each chosen with equal probability) to a | ||||
|   #     coding sequence and tally the effect of each one on the translation. | ||||
|   # Parameters: | ||||
|   #     seq: coding DNA sequence; anything Biostrings::DNAString() accepts. | ||||
|   #     N:   number of mutation trials (a single non-negative number). | ||||
|   # Value: | ||||
|   #     A 3 x 1 numeric matrix with rows "silent", "missense", "nonsense" | ||||
|   #     and column "occurrences"; the counts sum to N. | ||||
|   # Notes: | ||||
|   #     Every trial mutates the original sequence, never a previous mutant. | ||||
|   #     After an indel the mutant is cut back to whole codons and translated | ||||
|   #     in the shifted frame. | ||||
|   # Example: gen_mutations("ATGATGATGATGATGATG", 1200) | ||||
|  | ||||
|   stopifnot(is.numeric(N), length(N) == 1, !is.na(N), N >= 0) | ||||
|  | ||||
|   # Position of the first stop ("*") in a protein string, NA if there is none. | ||||
|   first_stop <- function(aa) { | ||||
|     pos <- regexpr("*", aa, fixed = TRUE) | ||||
|     if (pos < 0) NA_integer_ else as.integer(pos) | ||||
|   } | ||||
|  | ||||
|   stats <- matrix(0, nrow = 3, ncol = 1, | ||||
|                   dimnames = list(c("silent", "missense", "nonsense"), | ||||
|                                   "occurrences")) | ||||
|  | ||||
|   # The reference and its translation are the same for every trial, so they | ||||
|   # are computed once. Going through DNAString() validates the alphabet. | ||||
|   ref_seq <- as.character(Biostrings::DNAString(seq)) | ||||
|   ref_len <- nchar(ref_seq) | ||||
|   if (ref_len < 1) { | ||||
|     stop("seq must not be empty", call. = FALSE) | ||||
|   } | ||||
|   ref_aa <- as.character( | ||||
|     Biostrings::translate(Biostrings::DNAString(ref_seq), | ||||
|                           no.init.codon = TRUE) | ||||
|   ) | ||||
|   ref_stop <- first_stop(ref_aa) | ||||
|  | ||||
|   # Run exactly N trials (this used to be hard-coded to 217). | ||||
|   for (i in seq_len(N)) { | ||||
|     mut_action <- sample(c("ins", "del", "sub"), 1) | ||||
|     if (mut_action == "sub") { | ||||
|       # Replace one base with one of the three other bases. | ||||
|       mut_index <- sample.int(ref_len, 1) | ||||
|       old_base <- substr(ref_seq, mut_index, mut_index) | ||||
|       new_base <- sample(setdiff(Biostrings::DNA_BASES, old_base), 1) | ||||
|       mut_seq <- ref_seq | ||||
|       substr(mut_seq, mut_index, mut_index) <- new_base | ||||
|     } else if (mut_action == "ins") { | ||||
|       # Insert a random base in front of position mut_index. The original | ||||
|       # `1:length(x) - 2` parsed as `(1:length(x)) - 2` and could draw the | ||||
|       # invalid positions -1 and 0; the range is now 1 .. (length - 2). | ||||
|       # NOTE(review): the "- 2" upper bound is kept from the original; | ||||
|       # confirm that the last two positions are meant to be excluded. | ||||
|       mut_index <- sample.int(max(ref_len - 2, 1), 1) | ||||
|       new_base <- sample(Biostrings::DNA_BASES, 1) | ||||
|       mut_seq <- paste0(substr(ref_seq, 1, mut_index - 1), | ||||
|                         new_base, | ||||
|                         substr(ref_seq, mut_index, ref_len)) | ||||
|     } else { | ||||
|       # Delete the base at position mut_index. | ||||
|       mut_index <- sample.int(ref_len, 1) | ||||
|       mut_seq <- paste0(substr(ref_seq, 1, mut_index - 1), | ||||
|                         substr(ref_seq, mut_index + 1, ref_len)) | ||||
|     } | ||||
|  | ||||
|     # Indels change the length: drop trailing bases that no longer form a | ||||
|     # complete codon before translating. | ||||
|     codon_len <- nchar(mut_seq) - (nchar(mut_seq) %% 3) | ||||
|     mut_aa <- as.character( | ||||
|       Biostrings::translate( | ||||
|         Biostrings::DNAString(substr(mut_seq, 1, codon_len)), | ||||
|         no.init.codon = TRUE | ||||
|       ) | ||||
|     ) | ||||
|     mut_stop <- first_stop(mut_aa) | ||||
|  | ||||
|     # nonsense: the mutant has a stop the reference lacks, or an earlier one. | ||||
|     # silent:   the protein is unchanged. | ||||
|     # missense: everything else. | ||||
|     if (!is.na(mut_stop) && (is.na(ref_stop) || mut_stop < ref_stop)) { | ||||
|       outcome <- "nonsense" | ||||
|     } else if (identical(mut_aa, ref_aa)) { | ||||
|       outcome <- "silent" | ||||
|     } else { | ||||
|       outcome <- "missense" | ||||
|     } | ||||
|     stats[outcome, "occurrences"] <- stats[outcome, "occurrences"] + 1 | ||||
|   } | ||||
|   return(stats) | ||||
| } | ||||
| N_test <- 1200 | ||||
| gen_mutations("ATGATGATGATGATGATG", N_test) | ||||
| gen_mutations("CCCCCCCCCCCCCCCCCC", N_test) | ||||
| gen_mutations("TATTACTATTACTATTAC", N_test) | ||||
| gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", N_test) | ||||
| gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", N_test) | ||||
| gen_mutations <- function(seq, N) { | ||||
|   # Purpose: | ||||
|   #     Run N independent trials. Each trial substitutes one randomly chosen | ||||
|   #     base of seq with a different base, translates the result, and | ||||
|   #     classifies the change as silent, missense or nonsense. | ||||
|   # Parameters: | ||||
|   #     seq: character. Nucleotide sequence, passed to Biostrings::DNAString(). | ||||
|   #     N:   numeric. Number of trials. N == 0 returns all-zero counts. | ||||
|   # Value: | ||||
|   #     A 3 x 1 numeric matrix with rows "silent", "missense", "nonsense" | ||||
|   #     and a single column "occurrences". | ||||
|  | ||||
|   sealKey() # See: http://steipe.biochemistry.utoronto.ca/abc/index.php/BCH441_Code_submisson_instructions | ||||
|   stats <- matrix(0, nrow = 3, ncol = 1, | ||||
|                   dimnames = list(c("silent", "missense", "nonsense"), | ||||
|                                   "occurrences")) | ||||
|  | ||||
|   # The reference sequence, its translation and the position of its first | ||||
|   # stop codon are the same for every trial: compute them once. This uses no | ||||
|   # random numbers, so the sequence of random draws is unchanged. | ||||
|   original_seq <- Biostrings::DNAString(seq) | ||||
|   original_bases <- strsplit(as.character(original_seq), "")[[1]] | ||||
|   aa_seq <- Biostrings::translate(original_seq, no.init.codon = TRUE) | ||||
|   aa_chr <- as.character(aa_seq) | ||||
|   term_aa <- as.integer(regexpr(pattern = "\\*", aa_chr))  # -1: no stop | ||||
|  | ||||
|   # Actual function | ||||
|   # seq_len() rather than 1:N, because 1:0 would run two trials for N == 0. | ||||
|   for (i in seq_len(N)) { | ||||
|     # Pick the position first, then the replacement base. | ||||
|     mut_index <- sample.int(length(original_seq), 1) | ||||
|     possible_mutations <- setdiff(Biostrings::DNA_BASES, | ||||
|                                   original_bases[mut_index]) | ||||
|     mut_seq <- Biostrings::replaceLetterAt(original_seq, mut_index, | ||||
|                                            sample(possible_mutations, 1)) | ||||
|     mut_chr <- as.character( | ||||
|       Biostrings::translate(mut_seq, no.init.codon = TRUE)) | ||||
|  | ||||
|     term_mut_aa <- as.integer(regexpr(pattern = "\\*", mut_chr)) | ||||
|     if (term_mut_aa != -1 && (term_aa == -1 || term_mut_aa < term_aa)) { | ||||
|       # A stop codon appeared, or now occurs upstream of the original one. | ||||
|       stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"] | ||||
|     } else if (identical(mut_chr, aa_chr)) { | ||||
|       stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"] | ||||
|     } else { | ||||
|       stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"] | ||||
|     } | ||||
|   } | ||||
|   sealKey() | ||||
|   return(stats) | ||||
| } | ||||
|  | ||||
| gen_mutations("ATGATGATGATGATGATG", 1000) | ||||
| gen_mutations("CCCCCCCCCCCCCCCCCC", 500) | ||||
| gen_mutations("TATTACTATTACTATTAC", 500) | ||||
| gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", 500) | ||||
| gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", 500) | ||||
| gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGA", 500) | ||||
|  | ||||
|  | ||||
| myFA <-             readFASTA("data/RAB39B_HSa_coding.fa") | ||||
| myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa")) | ||||
| myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa")) | ||||
| myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa")) | ||||
| rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names | ||||
|  | ||||
| gen_mutations(myFA["RAB39B", 2], 10000) | ||||
| gen_mutations(myFA["PTPN5", 2], 10000) | ||||
| gen_mutations(myFA["PTPN11", 2], 10000) | ||||
| gen_mutations(myFA["KRAS", 2], 10000) | ||||
|   | ||||
| @@ -1,41 +1,41 @@ | ||||
| # ==   1.3  Task: submit for credit (part 1/2)  ================================ | ||||
| # == Submission - Code to add another philosopher to the datamodel: | ||||
|  | ||||
| pID <- autoincrement(philDB$person) | ||||
| immanuelKant <- data.frame(id = pID, | ||||
|                            name = "Immanuel Kant", | ||||
|                            born = "1724", | ||||
|                            died = "1804", | ||||
|                            school = "Enlightenment Philosophy") | ||||
| philDB$person <- rbind(philDB$person, immanuelKant) | ||||
|  | ||||
| bID = autoincrement(philDB$books) | ||||
| immanuelKantWork <- data.frame(id = bID, | ||||
|                                title = "Critique of Pure Reason", | ||||
|                                published = "1781") | ||||
| philDB$books <- rbind(philDB$books, immanuelKantWork) | ||||
| philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)) | ||||
|  | ||||
| bID = autoincrement(philDB$books) | ||||
| immanuelKantWork <- data.frame(id = bID, | ||||
|                                title = "Critique of Judgement", | ||||
|                                published = "1790") | ||||
| philDB$books <- rbind(philDB$books, immanuelKantWork) | ||||
| philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)) | ||||
|  | ||||
| # == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order. | ||||
|  | ||||
| schools <- unique(philDB$person$school) | ||||
| schools <- sort(schools) | ||||
|  | ||||
| for (s in schools) { | ||||
|   cat(sprintf("%s\n", s)) | ||||
|   authors = which(philDB$person$school == s) | ||||
|   for (author in authors) { | ||||
|     works = which(philDB$works$personID == author) | ||||
|     for (work in works) { | ||||
|       bookId = which(philDB$books$id == philDB$works$bookID[work]) | ||||
|       cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId])) | ||||
|     } | ||||
|   } | ||||
| # ==   1.3  Task: submit for credit (part 1/2)  ================================ | ||||
| # == Submission - Code to add another philosopher to the datamodel: | ||||
|  | ||||
| # Add the philosopher. The generated key is kept in pID so that the rows | ||||
| # added to philDB$works below can reference it. | ||||
| pID <- autoincrement(philDB$person) | ||||
| immanuelKant <- data.frame( | ||||
|   id     = pID, | ||||
|   name   = "Immanuel Kant", | ||||
|   born   = "1724", | ||||
|   died   = "1804", | ||||
|   school = "Enlightenment Philosophy" | ||||
| ) | ||||
| philDB$person <- rbind(philDB$person, immanuelKant) | ||||
|  | ||||
| # First book, followed by the works row that links it to pID. | ||||
| bID <- autoincrement(philDB$books) | ||||
| immanuelKantWork <- data.frame( | ||||
|   id        = bID, | ||||
|   title     = "Critique of Pure Reason", | ||||
|   published = "1781" | ||||
| ) | ||||
| philDB$books <- rbind(philDB$books, immanuelKantWork) | ||||
| philDB$works <- rbind( | ||||
|   philDB$works, | ||||
|   data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID) | ||||
| ) | ||||
|  | ||||
| # Second book, linked to pID in the same way. | ||||
| bID <- autoincrement(philDB$books) | ||||
| immanuelKantWork <- data.frame( | ||||
|   id        = bID, | ||||
|   title     = "Critique of Judgement", | ||||
|   published = "1790" | ||||
| ) | ||||
| philDB$books <- rbind(philDB$books, immanuelKantWork) | ||||
| philDB$works <- rbind( | ||||
|   philDB$works, | ||||
|   data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID) | ||||
| ) | ||||
|  | ||||
| # == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order. | ||||
|  | ||||
| # Print every school (alphabetically) followed by the books written by its | ||||
| # members (alphabetically by title), one tab-indented line per book. | ||||
| schools <- sort(unique(philDB$person$school)) | ||||
|  | ||||
| for (s in schools) { | ||||
|   cat(sprintf("%s\n", s)) | ||||
|  | ||||
|   # Join on the stored keys, not on row positions: which() on the person | ||||
|   # table yields row numbers, and these only equal the "id" values as long | ||||
|   # as no row was ever removed or reordered. | ||||
|   authorIDs <- philDB$person$id[which(philDB$person$school == s)] | ||||
|   bookIDs <- philDB$works$bookID[philDB$works$personID %in% authorIDs] | ||||
|   books <- philDB$books[philDB$books$id %in% bookIDs, , drop = FALSE] | ||||
|  | ||||
|   # The task asks for the books in alphabetical order as well. | ||||
|   books <- books[order(as.character(books$title)), , drop = FALSE] | ||||
|  | ||||
|   for (i in seq_len(nrow(books))) { | ||||
|     cat(sprintf("\t%s - (%s)\n", books$title[i], books$published[i])) | ||||
|   } | ||||
| } | ||||
| @@ -1,4 +1,4 @@ | ||||
| [{ | ||||
| 	"ID": 879819, | ||||
| 	"species": "Cutaneotrichosporon oleaginosum"} | ||||
| ] | ||||
| [{ | ||||
| 	"ID": 879819, | ||||
| 	"species": "Cutaneotrichosporon oleaginosum"} | ||||
| ] | ||||
|   | ||||
| @@ -1,19 +1,19 @@ | ||||
| [ | ||||
|   { "name" : "MBP1_CUTOL", | ||||
|     "RefSeqID" : "XP_018278493.1", | ||||
|     "UniProtID" : "A0A0J0XLN0", | ||||
|     "taxonomyID" : 879819, | ||||
|     "sequence" : [ | ||||
|        "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ", | ||||
|        "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK", | ||||
|        "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS", | ||||
|        "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA", | ||||
|        "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN", | ||||
|        "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD", | ||||
|        "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT", | ||||
|        "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND", | ||||
|        "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD", | ||||
|        "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID", | ||||
|        "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"] | ||||
|   } | ||||
| ] | ||||
| [ | ||||
|   { "name" : "MBP1_CUTOL", | ||||
|     "RefSeqID" : "XP_018278493.1", | ||||
|     "UniProtID" : "A0A0J0XLN0", | ||||
|     "taxonomyID" : 879819, | ||||
|     "sequence" : [ | ||||
|        "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ", | ||||
|        "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK", | ||||
|        "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS", | ||||
|        "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA", | ||||
|        "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN", | ||||
|        "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD", | ||||
|        "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT", | ||||
|        "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND", | ||||
|        "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD", | ||||
|        "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID", | ||||
|        "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"] | ||||
|   } | ||||
| ] | ||||
|   | ||||
| @@ -1,8 +1,8 @@ | ||||
| README - myScripts folder: | ||||
| ========================== | ||||
|  | ||||
| The "myScripts" folder is a place to keep your personal files | ||||
| safe. No files will be submitted into this folder on the GitHub master | ||||
| copy. Therefore everything you put into this folder is safe from being | ||||
| inadvertently overwritten by a file with the same name that would be | ||||
| downloaded in a GitHub "pull" request. | ||||
| README - myScripts folder: | ||||
| ========================== | ||||
|  | ||||
| The "myScripts" folder is a place to keep your personal files | ||||
| safe. No files will be submitted into this folder on the GitHub master | ||||
| copy. Therefore everything you put into this folder is safe from being | ||||
| inadvertently overwritten by a file with the same name that would be | ||||
| downloaded in a GitHub "pull" request. | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| source("./scripts/ABC-createRefDB.R") | ||||
|  | ||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json")) | ||||
| myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json")) | ||||
| source("./scripts/ABC-createRefDB.R") | ||||
|  | ||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json")) | ||||
| myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json")) | ||||
|   | ||||
| @@ -1,38 +1,38 @@ | ||||
| # myScript.R | ||||
| # | ||||
| # --- As you work with this file, you can delete the instructions below -------- | ||||
| # Write your notes and code experiments into this document. Save it | ||||
| # from time to time - however I recommend that you do not _commit_ | ||||
| # your saved version. | ||||
| # | ||||
| # As long as you do not _commit_ this script to version control, | ||||
| # you can _pull_ updated versions of the entire project from GitHub | ||||
| # by using the RStudio version control interface. However, once | ||||
| # you _commit_ any file in your local version, RStudio will require | ||||
| # you to resolve conflicts before you can _pull_ updates. | ||||
| # --- As you work with this file, you can delete the instructions above -------- | ||||
| # | ||||
| ## Purpose: <...> | ||||
| # | ||||
| # Version: <...> | ||||
| # | ||||
| # Date:    <...> | ||||
| # Author:  <Name> (<namee@mail.utoronto.ca>) | ||||
| # | ||||
| # Versions: | ||||
| # | ||||
| #   <number>    <Features> | ||||
| # | ||||
| # TODO: | ||||
| #   <...> | ||||
| # | ||||
| # ==================================================================== | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|  | ||||
| # myScript.R | ||||
| # | ||||
| # --- As you work with this file, you can delete the instructions below -------- | ||||
| # Write your notes and code experiments into this document. Save it | ||||
| # from time to time - however I recommend that you do not _commit_ | ||||
| # your saved version. | ||||
| # | ||||
| # As long as you do not _commit_ this script to version control, | ||||
| # you can _pull_ updated versions of the entire project from GitHub | ||||
| # by using the RStudio version control interface. However, once | ||||
| # you _commit_ any file in your local version, RStudio will require | ||||
| # you to resolve conflicts before you can _pull_ updates. | ||||
| # --- As you work with this file, you can delete the instructions above -------- | ||||
| # | ||||
| ## Purpose: <...> | ||||
| # | ||||
| # Version: <...> | ||||
| # | ||||
| # Date:    <...> | ||||
| # Author:  <Name> (<namee@mail.utoronto.ca>) | ||||
| # | ||||
| # Versions: | ||||
| # | ||||
| #   <number>    <Features> | ||||
| # | ||||
| # TODO: | ||||
| #   <...> | ||||
| # | ||||
| # ==================================================================== | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|  | ||||
|   | ||||
							
								
								
									
										2868
									
								
								plottingReference.R
									
									
									
									
									
								
							
							
						
						
									
										2868
									
								
								plottingReference.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										150
									
								
								scriptTemplate.R
									
									
									
									
									
								
							
							
						
						
									
										150
									
								
								scriptTemplate.R
									
									
									
									
									
								
							| @@ -1,75 +1,75 @@ | ||||
| # scriptTemplate.R | ||||
| # | ||||
| # Purpose: | ||||
| # Version: | ||||
| # Date: | ||||
| # Author: | ||||
| # | ||||
| # Input: | ||||
| # Output: | ||||
| # Dependencies: | ||||
| # | ||||
| # ToDo: | ||||
| # Notes: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| setwd("<your/project/directory>") | ||||
|  | ||||
| # ====  PARAMETERS  ============================================================ | ||||
| # Define and explain all parameters. No "magic numbers" in your code below. | ||||
|  | ||||
|  | ||||
|  | ||||
| # ====  PACKAGES  ============================================================== | ||||
| # Check that required packages have been installed. Install if needed. | ||||
|  | ||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = seqinr)       # basic information | ||||
| #  browseVignettes("seqinr")    # available vignettes | ||||
| #  data(package = "seqinr")     # available datasets | ||||
|  | ||||
| # Note: use package functions with the :: operator - eg. | ||||
| # seqinr::aaa("K") | ||||
|  | ||||
|  | ||||
|  | ||||
| # ====  FUNCTIONS  ============================================================= | ||||
|  | ||||
| # Define functions or source external files | ||||
| source("<myUtilityFunctionsScript.R>") | ||||
|  | ||||
| myFunction <- function(a, b=1) { | ||||
| 	# Purpose: | ||||
| 	#     Describe ... | ||||
| 	# Parameters: | ||||
| 	#     a: ... | ||||
| 	#     b: ... | ||||
| 	# Value: | ||||
| 	#     result: ... | ||||
|  | ||||
| 	# code ... | ||||
|  | ||||
| 	return(result) | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # ====  PROCESS  =============================================================== | ||||
| # Enter the step-by-step process of your project here. Strive to write your | ||||
| # code so that you can simply run this entire file and re-create all | ||||
| # intermediate results. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # ====  TESTS  ================================================================= | ||||
| # Enter your function tests here... | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # scriptTemplate.R | ||||
| # | ||||
| # Purpose: | ||||
| # Version: | ||||
| # Date: | ||||
| # Author: | ||||
| # | ||||
| # Input: | ||||
| # Output: | ||||
| # Dependencies: | ||||
| # | ||||
| # ToDo: | ||||
| # Notes: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
| setwd("<your/project/directory>") | ||||
|  | ||||
| # ====  PARAMETERS  ============================================================ | ||||
| # Define and explain all parameters. No "magic numbers" in your code below. | ||||
|  | ||||
|  | ||||
|  | ||||
| # ====  PACKAGES  ============================================================== | ||||
| # Check that required packages have been installed. Install if needed. | ||||
|  | ||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||
|   install.packages("seqinr") | ||||
| } | ||||
| # Package information: | ||||
| #  library(help = seqinr)       # basic information | ||||
| #  browseVignettes("seqinr")    # available vignettes | ||||
| #  data(package = "seqinr")     # available datasets | ||||
|  | ||||
| # Note: use package functions with the :: operator - eg. | ||||
| # seqinr::aaa("K") | ||||
|  | ||||
|  | ||||
|  | ||||
| # ====  FUNCTIONS  ============================================================= | ||||
|  | ||||
| # Define functions or source external files | ||||
| source("<myUtilityFunctionsScript.R>") | ||||
|  | ||||
| myFunction <- function(a, b=1) { | ||||
| 	# Purpose: | ||||
| 	#     Describe ... | ||||
| 	# Parameters: | ||||
| 	#     a: ... | ||||
| 	#     b: ... | ||||
| 	# Value: | ||||
| 	#     result: ... | ||||
|  | ||||
| 	# code ... | ||||
|  | ||||
| 	return(result) | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # ====  PROCESS  =============================================================== | ||||
| # Enter the step-by-step process of your project here. Strive to write your | ||||
| # code so that you can simply run this entire file and re-create all | ||||
| # intermediate results. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # ====  TESTS  ================================================================= | ||||
| # Enter your function tests here... | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,30 +1,30 @@ | ||||
| # ABC-createRefDB.R | ||||
| # | ||||
| # Create a reference protein database for Mbp1-like proteins | ||||
| # | ||||
| # Boris Steipe for ABC learning units | ||||
| # | ||||
| # For the species, see: | ||||
| # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi | ||||
| # | ||||
| # For the data model, see | ||||
| # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0 | ||||
| # For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| myDB <- dbInit() | ||||
|  | ||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json")) | ||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json")) | ||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json")) | ||||
|  | ||||
| myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json")) | ||||
|  | ||||
| myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json")) | ||||
|  | ||||
| myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json")) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # ABC-createRefDB.R | ||||
| # | ||||
| # Create a reference protein database for Mbp1-like proteins | ||||
| # | ||||
| # Boris Steipe for ABC learning units | ||||
| # | ||||
| # For the species, see: | ||||
| # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi | ||||
| # | ||||
| # For the data model, see | ||||
| # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0 | ||||
| # For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| myDB <- dbInit() | ||||
|  | ||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json")) | ||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json")) | ||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json")) | ||||
|  | ||||
| myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json")) | ||||
|  | ||||
| myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json")) | ||||
|  | ||||
| myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json")) | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,443 +1,443 @@ | ||||
| # tocID <- "scripts/ABC-makeMYSPElist.R" | ||||
| # | ||||
| # Purpose:  Create a list of genome sequenced fungi with protein annotations and | ||||
| #               Mbp1 homologues. | ||||
| # | ||||
| # Version: 1.4 | ||||
| # | ||||
| # Date:    2016  09  -  2021  09 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions | ||||
| #          1.4    New retrieval logic | ||||
| #          1.3    Rewrite to change datasource. NCBI has not been updated | ||||
| #                   since 2012. Use ensembl fungi as initial source. | ||||
| #          1.2    Change from require() to requireNamespace() | ||||
| #          1.1.2  Moved BLAST.R to ./scripts directory | ||||
| #          1.1    Update 2017 | ||||
| #          1.0    First code 2016 | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # DO NOT  source()  THIS FILE! | ||||
| # | ||||
| # This file is code I provide for your deeper understanding of a process and | ||||
| # to provide you with useful sample code. It is not actually necessary for | ||||
| # you to run this code, but I encourage you to read it carefully and discuss | ||||
| # if there are parts you don't understand. | ||||
| # | ||||
| # Run the commands that interact with the NCBI servers only if you want to | ||||
| # experiment specifically with the code and/or parameters. I have commented out | ||||
| # those parts. If you only want to study the general workflow, just load() | ||||
| # the respective intermediate results. | ||||
| # | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                    Line | ||||
| #TOC> -------------------------------------------------------- | ||||
| #TOC>   1        The strategy                               55 | ||||
| #TOC>   2        PACKAGES AND INITIALIZATIONS               67 | ||||
| #TOC>   3        ENSEMBL FUNGI                              75 | ||||
| #TOC>   3.1        Import                                   78 | ||||
| #TOC>   4        BLAST SEARCH                              155 | ||||
| #TOC>   4.1        find homologous proteins                161 | ||||
| #TOC>   4.2        Identify species in "hits"              192 | ||||
| #TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282 | ||||
| #TOC>   6        STUDENT NUMBERS                           375 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  The strategy  ======================================================== | ||||
|  | ||||
| # This script will create a list of "MYSPE" species and save it in an R object | ||||
| # MYSPEspecies that is stored in the data subdirectory of this project from | ||||
| # where it can be loaded. The strategy is as follows: we download a list of | ||||
| # annotated fungal genomes from ensembl.fungi. All these are genome-sequenced | ||||
| # species that have been annotated. | ||||
| # Next we perform a BLAST search, to identify fungal species that have | ||||
| # genes that are homologous to yeast MBP1. | ||||
| # | ||||
| # ... | ||||
|  | ||||
| # =    2  PACKAGES AND INITIALIZATIONS  ======================================== | ||||
|  | ||||
| # httr provides interfaces to Webservers on the Internet | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    3  ENSEMBL FUNGI  ======================================================= | ||||
|  | ||||
|  | ||||
| # ==   3.1  Import  ============================================================ | ||||
|  | ||||
| # Navigate to https://fungi.ensembl.org and click on the link to the full | ||||
| # list of all species: https://fungi.ensembl.org/species.html | ||||
| # On the page, click on the spreadsheet symbol top right and choose | ||||
| # "download whole table". The file will be named  "Species.csv", in your | ||||
| # usual downloads folder. Move it to the data folder, and read it. | ||||
|  | ||||
| sDat <- read.csv("./data/Species.csv") | ||||
| str(sDat) | ||||
|  | ||||
| # The most obvious way to partition these is according to Classification ... | ||||
| # (poking around a bit in the UniProt taxonomy database shows that the | ||||
| #  classification used here is the taxonomic rank of "order"). | ||||
| # how many classifications do we have? | ||||
| length(unique(sDat$Classification))  # 66 | ||||
|  | ||||
| # To have a good set for the class, we should have about 100. | ||||
| # Let's see for which of these we can find Mbp1 homologues. | ||||
| # First, we'll keep only the columns for name, classification, and taxID, and | ||||
| # drop the rest ... | ||||
| sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")] | ||||
| colnames(sDat) <- c("name", "order", "taxID") | ||||
|  | ||||
| # Next, we make an extra column: genus - the first part of the binomial name. | ||||
| # We'll use the gsub() function, and for that we need a "regular expression" | ||||
| # that matches to all characters from the first blank to the end of the string: | ||||
| myPatt <- "\\s.*$"  # one whitespace (\\s) ... | ||||
|                     # followed by any character (.) 0..n times (*) ... | ||||
|                     # until the end of the string | ||||
|  | ||||
| # using gsub() we substitute all matching characters with the empty string "" - | ||||
| # this deletes the matching characters | ||||
| # Test this: | ||||
| gsub(myPatt, "", "Genus")                      # one word: unchanged | ||||
| gsub(myPatt, "", "gEnus species")              # two words: return only first | ||||
| gsub(myPatt, "", "geNus species strain 123")   # many words: return only first | ||||
|  | ||||
| # apply this to the "name" column and add the result as a separate column | ||||
| # called "genus" | ||||
| sDat$genus <- gsub(myPatt, "", sDat$name) | ||||
|  | ||||
| # what do we get? | ||||
| c(head(unique(sDat$genus)), | ||||
|   tail(unique(sDat$genus)))  # inspect the first and last few. Note that there | ||||
|                              # is a problem that we have to keep in mind. | ||||
|                              # (Always inspect your results!) | ||||
| # Drop all rows for which the genus contains special characters - | ||||
| # like "[Candida]" | ||||
| sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ] | ||||
|  | ||||
| length(table(sDat$genus))    # how many genus? | ||||
| hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ... | ||||
|                                               # most genus have very few, but | ||||
|                                               # some have very many species. | ||||
| sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten... | ||||
|  | ||||
| # We should have at least one species from each taxonomic order, but we can | ||||
| # add a few genus until we have about 100 validated species. | ||||
|  | ||||
| # Let's add a column for species, by changing our regular expression a bit, | ||||
| # using ^ (start of string), \\S (NOT a whitespace), | ||||
| # and + (one or more matches), capturing the match (...), and returning | ||||
| # it as the substitution (\\1) ... | ||||
|  | ||||
| myPatt <- "^(\\S+\\s\\S+)\\s.*$" | ||||
| sDat$species <- gsub(myPatt, "\\1", sDat$name) | ||||
|  | ||||
| # And we reorder the columns, just for aesthetics: | ||||
| sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")] | ||||
|  | ||||
| # Final check: | ||||
| any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters | ||||
|  | ||||
| # | ||||
| # Now we check which of these have Mbp1 homologues ... | ||||
|  | ||||
| # =    4  BLAST SEARCH  ======================================================== | ||||
|  | ||||
|  | ||||
| # We run a BLAST search to find all proteins related to yeast Mbp1 in any | ||||
| # fungus. With the results, we'll annotate our sDat table. | ||||
|  | ||||
| # ==   4.1  find homologous proteins  ========================================== | ||||
| # | ||||
| # Use BLAST to fetch proteins related to Mbp1 and identify the species that | ||||
| # contain them. | ||||
|  | ||||
| # Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair | ||||
| # amount of error handling involved that is not supported by the API in a | ||||
| # principled way but requires rather ad hoc solutions. The code I threw together | ||||
| # to make a BLAST interface (demo-quality, not research-quality) is in the file | ||||
| # ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty | ||||
| # standard task of communicating with servers and parsing responses - everyday | ||||
| # fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST | ||||
| # parser in currently available packages. | ||||
| # | ||||
| # DON'T use this for BLAST searches unless you have read the NCBI policy | ||||
| # for automated tasks. If you indiscriminately pound on the NCBI's BLAST | ||||
| # server, they will blacklist your IP-address. See: | ||||
| # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo | ||||
| # | ||||
| # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq | ||||
| # BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID | ||||
| #                    db = "refseq_protein",        # database to search in | ||||
| #                    nHits = 3000,                 # 945 hits in 2020 | ||||
| #                    E = 0.01,                     # | ||||
| #                    limits = "txid4751[ORGN]")    # = fungi | ||||
| # saveRDS(BLASThits, file="data/BLASThits.rds") | ||||
| # | ||||
| # NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory | ||||
| # | ||||
| BLASThits <- readRDS(file = "data/BLASThits.rds") | ||||
|  | ||||
| # ==   4.2  Identify species in "hits"  ======================================== | ||||
|  | ||||
| # This is a very big list that can't be usefully analyzed manually. Here | ||||
| # we are only interested in the species names that it contains. | ||||
|  | ||||
| # How many hits in the list? | ||||
| length(BLASThits$hits)      # 1,134 | ||||
|  | ||||
| # Let's look at a hit somewhere down the list. (We spell out the full element | ||||
| # name "hits": `$hit` would only work through partial matching of list names.) | ||||
| str(BLASThits$hits[[277]]) | ||||
|  | ||||
| # A fair amount of parsing has gone into the BLAST.R code to prepare the results | ||||
| # in a useful way. The species information is in the $species element of every | ||||
| # hit. | ||||
|  | ||||
| # Run a loop to extract all the species names into a vector. We subset ... | ||||
| # BLASThits$hits                 ... the list of hits, from which we choose ... | ||||
| # BLASThits$hits[[i]]            ... the i-th hit, and get ... | ||||
| # BLASThits$hits[[i]]$species    ... the species element from that. | ||||
| # Subsetting FTW. | ||||
|  | ||||
| BLASTspecies <- character() | ||||
| for (i in seq_along(BLASThits$hits)) { | ||||
|     BLASTspecies[i] <- BLASThits$hits[[i]]$species | ||||
| } | ||||
|  | ||||
| # You can confirm that BLASTspecies has the expected size. | ||||
| length(BLASTspecies) | ||||
|  | ||||
| # if we delete some of these later on, we still want to remember which hit | ||||
| # they came from. Thus we name() the elements with their index, which is the | ||||
| # same as the index of the hit in BLASThits. seq_along() stays correct even | ||||
| # for an empty vector, where 1:length() would yield c(1, 0). | ||||
| names(BLASTspecies) <- seq_along(BLASTspecies) | ||||
|  | ||||
|  | ||||
| # let's plot the distribution of E-values | ||||
| eVals <- numeric() | ||||
| for (i in seq_along(BLASThits$hits)) { | ||||
|   eVals[i] <- BLASThits$hits[[i]]$E | ||||
| } | ||||
| range(eVals) | ||||
| sum(eVals == 0) | ||||
|  | ||||
| # let's plot the log of all values > 0 to see how they are distributed | ||||
| # plotting only one vector of numbers plots their index as x, and | ||||
| # their value as y ... | ||||
| plot(log(eVals[eVals > 0]), col = "#CC0000") | ||||
|  | ||||
| # This is very informative: I would suspect that the first ten or so are | ||||
| # virtually identical to the yeast protein, then we have about 800 hits with | ||||
| # decreasing similarity, and then about 200 more that may actually be false | ||||
| # positives. Also - we plotted them by index, that means the table is SORTED: | ||||
| # Lower E-values strictly come before higher E-values. | ||||
|  | ||||
| # Again, some species appear more than once, e.g. ... | ||||
| sum(BLASTspecies == "Saccharomyces cerevisiae") | ||||
|  | ||||
| # ... corresponding to the five homologous gene sequences (paralogues) of yeast. | ||||
|  | ||||
| # Therefore we remove duplicates. Removing duplicates will leave the FIRST | ||||
| # in a list alone, and only remove the SUBSEQUENT ones. Which means, from each | ||||
| # species, we will retain only the protein that has the highest similarity | ||||
| # to yeast Mbp1, not any of its more distant paralogues. | ||||
| sel <- ! duplicated(BLASTspecies) | ||||
| BLASTspecies <- BLASTspecies[sel] | ||||
|  | ||||
| length(BLASTspecies) | ||||
| # i.e. we got rid of about two thirds of the hits. | ||||
| tail(BLASTspecies)  # see how the names are useful! | ||||
|                     # again - there are some special characters ... | ||||
|                     # what are they? | ||||
| BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)] | ||||
|  | ||||
| # remove the brackets ... | ||||
| BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies) | ||||
| # drop any new duplicates ... | ||||
| BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)] | ||||
|  | ||||
| # check the number again: | ||||
| length(BLASTspecies) | ||||
| # Think a bit about this: what may be the biological reason to find that | ||||
| # on average, in 388 fungi across the entire phylogenetic tree, we have | ||||
| # three sequences that are homologous to yeast Mbp1? | ||||
|  | ||||
| # Let's look at the distribution of E-values in this selection (Subsetting FTW): | ||||
| # we plot all values that are TRUE in the vector "sel" that we created above, | ||||
| # AND greater than 0 | ||||
| plot(log(eVals[sel & eVals > 0]), col = "#00CC00") | ||||
|  | ||||
|  | ||||
| # =    5  MERGE ENSEMBL AND BLAST RESULTS  ===================================== | ||||
|  | ||||
| # Next we add the blast result to our sDat dataframe. We'll store the index, | ||||
| # the E-value, and the Query-bounds from which we can estimate which domains | ||||
| # of Mbp1 are actually covered by the hit. (True orthologues MUST align with | ||||
| # Mbp1's N-terminal APSES domain.) | ||||
| # | ||||
| # First we pull the hits we wanted from the BLASTspecies: | ||||
| iHits <- as.numeric(names(BLASTspecies)) | ||||
| length(iHits)     # one index for each TRUE in sel | ||||
|  | ||||
| # add columns to sDat | ||||
| l <- nrow(sDat) | ||||
| sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results | ||||
| sDat$eVal   <- numeric(l)  # E-value of the hit | ||||
| sDat$lAli   <- numeric(l)  # length of the aligned region | ||||
|  | ||||
| # extract and merge | ||||
| for (iHit in iHits) { | ||||
|   thisSp <- BLASThits$hits[[iHit]]$species | ||||
|   sel <- sDat$species == thisSp | ||||
|  | ||||
|   sDat$iHit[sel]   <- iHit | ||||
|   sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E | ||||
|   sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli | ||||
| } | ||||
|  | ||||
| # Are all reference species accounted for? | ||||
| selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit | ||||
| REFspecies %in% sDat$species[selA]     # yes, all there | ||||
|  | ||||
| selB <- sDat$species %in% REFspecies   # all rows which have one of REF species | ||||
|  | ||||
| sum(selA & selB)   # How many rows? | ||||
|  | ||||
| # sDat of course includes all duplicates. Some may be multiply sequenced, some | ||||
| # may be different strains. We'll use the same strategy as before and keep | ||||
| # only the best hit: order the rows by E-value, then drop all rows which | ||||
| # are duplicated. | ||||
|  | ||||
|  | ||||
| # drop all rows without BLAST hits ... | ||||
| sDat <- sDat[ ! (sDat$iHit == 0) , ] | ||||
|  | ||||
| # order sDat by E-value ... | ||||
| sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ] | ||||
|  | ||||
| # drop all rows with duplicated species ... | ||||
| sDat <- sDat[ ! duplicated(sDat$species) , ] | ||||
|  | ||||
| # Let's look at the E-values ... | ||||
| plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00") | ||||
|  | ||||
| # and alignment lengths ... | ||||
| plot(sDat$lAli, col = "#00DDAA") | ||||
|  | ||||
| # How many ... | ||||
| length(unique(sDat$name)) | ||||
| length(unique(sDat$species)) | ||||
| length(unique(sDat$genus)) | ||||
| length(unique(sDat$order)) | ||||
|  | ||||
| # I need an extra species for admin purposes later on ... | ||||
| # We select with a logical mask rather than grep() indices: if nothing | ||||
| # matched, sDat[-integer(0), ] would silently drop ALL rows, whereas | ||||
| # sDat[ ! sel, ] leaves sDat unchanged. | ||||
| sel <- grepl("Sporothrix schenckii", sDat$species) | ||||
| SPOSCdat <- sDat[sel, ] | ||||
| sDat <- sDat[ ! sel, ] | ||||
|  | ||||
| # To get the final dataset, we remove the reference species with their | ||||
| # entire orders ... | ||||
| REForders <- unique(sDat$order[sDat$species %in% REFspecies]) | ||||
| sel <- sDat$order %in% REForders | ||||
| REFdat <- sDat[sel , ] | ||||
| sDat   <- sDat[ ! sel , ] | ||||
|  | ||||
| # REFdat should now contain only the REFspecies ... | ||||
| ( REFdat <- REFdat[REFdat$species %in% REFspecies , ] ) | ||||
|  | ||||
| # ... but all of them | ||||
| sum(REFspecies %in% REFdat$species) | ||||
|  | ||||
| # ... and we have enough left in sDat to prune sDat to unique genus | ||||
| sDat <- sDat[ ! duplicated(sDat$genus) , ] | ||||
| nrow(sDat)   # 84 | ||||
|  | ||||
| # I add back "Sporothrix schenckii" ... | ||||
| sDat <- rbind(SPOSCdat, sDat) | ||||
|  | ||||
| # ... and save for future use. | ||||
| # saveRDS(sDat, file = "data/sDat.rds") | ||||
| # saveRDS(REFdat, file = "data/REFdat.rds") | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    6  STUDENT NUMBERS  ===================================================== | ||||
| # | ||||
| # An asymmetric function to retrieve a MYSPE species | ||||
| # | ||||
| sDat <- readRDS(file = "data/sDat.rds") | ||||
|  | ||||
| students <- read.csv("../BCH441-2021-students.csv") | ||||
| sN <- students$Integration.ID | ||||
| sN <- sN[! is.na(sN)] | ||||
| sN <- as.character(sN) | ||||
| sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii" | ||||
|  | ||||
| # Fixed seed: the shuffle of the species and the random keys drawn below are | ||||
| # reproducible. The order of the two sample() calls must not be changed. | ||||
| set.seed(112358) | ||||
| theseSpecies <- sDat[sample(seq_len(nrow(sDat))), ] | ||||
| all(sort(theseSpecies$name) == sort(sDat$name)) | ||||
| nrow(theseSpecies) | ||||
| # Move "Sporothrix schenckii" to the first row. A logical mask is used for | ||||
| # the subsetting because negative indexing with an empty grep() result would | ||||
| # drop every row. | ||||
| isX <- grepl("Sporothrix schenckii", theseSpecies$name) | ||||
| (iX <- which(isX)) | ||||
| theseSpecies <- rbind(theseSpecies[isX, ], theseSpecies[ ! isX, ]) | ||||
| rndMin <-  992000000 | ||||
| rndMax <- 1020000000 | ||||
| N <- 10000 | ||||
| # Draw surplus random keys, discard any that collide with the numbers in sN, | ||||
| # keep N of them, then overwrite the first length(sN) slots with sN itself. | ||||
| keys <- as.character(sample(rndMin:rndMax, N + 1000)) | ||||
| keys <- keys[! (keys %in% sN)] | ||||
| stopifnot(length(keys) >= N, length(sN) <= N)  # otherwise keys would get NAs | ||||
| keys <- keys[seq_len(N)] | ||||
| keys[seq_along(sN)] <- sN | ||||
|  | ||||
| # Recycle the species rows until there are exactly N of them. Indexing with | ||||
| # recycled row numbers gives the same row order as stacking copies with | ||||
| # rbind() in a loop, without re-copying the growing data frame each time. | ||||
| MYSPEdat <- theseSpecies[rep_len(seq_len(nrow(theseSpecies)), N), ] | ||||
| # The row name of row i is the md5 digest of keys[i]. We compute all digests | ||||
| # first and assign the row names in one step. | ||||
| keyHash <- vapply(seq_len(N), | ||||
|                   function(i) digest::digest(keys[i], algo = "md5"), | ||||
|                   character(1)) | ||||
| rownames(MYSPEdat) <- keyHash | ||||
| # set.seed(NULL) re-initializes the RNG as if no seed had been set, so the | ||||
| # final shuffle of the rows is not tied to the fixed seed above. | ||||
| set.seed(NULL) | ||||
| MYSPEdat <- MYSPEdat[sample(seq_len(N)), ] | ||||
|  | ||||
| # saveRDS(MYSPEdat, file = "data/MYSPEdat.rds") | ||||
|  | ||||
| # === validate | ||||
| x <- character() | ||||
| for (n in sN) { | ||||
|   sp <- getMYSPE(n) | ||||
|   if (length(sp) != 1) { | ||||
|     stop(print(as.character(n))) | ||||
|   } else { | ||||
|     x <- c(x, sp) | ||||
|   } | ||||
| } | ||||
|  | ||||
| # === species for late-comers | ||||
| y <- unique(MYSPEdat$species) | ||||
| print(y[!(y %in% x)]) | ||||
|  | ||||
|  | ||||
| # === validate | ||||
| l <- length(sN) | ||||
| sp <- character(l) | ||||
| for(i in 1:l) { | ||||
|   sp[i] <- getMYSPE(sN[i]) | ||||
| } | ||||
| any(duplicated(sp)) | ||||
| length(unique(sp)) | ||||
| which(! sDat$species %in% sp)  # these can be assigned to late-comers | ||||
|  | ||||
| # Done. | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "scripts/ABC-makeMYSPElist.R" | ||||
| # | ||||
| # Purpose:  Create a list of genome sequenced fungi with protein annotations and | ||||
| #               Mbp1 homologues. | ||||
| # | ||||
| # Version: 1.4 | ||||
| # | ||||
| # Date:    2016  09  -  2021  09 | ||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions | ||||
| #          1.4    New retrieval logic | ||||
| #          1.3    Rewrite to change datasource. NCBI has not been updated | ||||
| #                   since 2012. Use ensembl fungi as initial source. | ||||
| #          1.2    Change from require() to requireNamespace() | ||||
| #          1.1.2  Moved BLAST.R to ./scripts directory | ||||
| #          1.1    Update 2017 | ||||
| #          1.0    First code 2016 | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # ============================================================================== | ||||
| # | ||||
| # DO NOT  source()  THIS FILE! | ||||
| # | ||||
| # This file is code I provide for your deeper understanding of a process and | ||||
| # to provide you with useful sample code. It is not actually necessary for | ||||
| # you to run this code, but I encourage you to read it carefully and discuss | ||||
| # if there are parts you don't understand. | ||||
| # | ||||
| # Run the commands that interact with the NCBI servers only if you want to | ||||
| # experiment specifically with the code and/or parameters. I have commented out | ||||
| # those parts. If you only want to study the general workflow, just load() | ||||
| # the respective intermediate results. | ||||
| # | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                    Line | ||||
| #TOC> -------------------------------------------------------- | ||||
| #TOC>   1        The strategy                               55 | ||||
| #TOC>   2        PACKAGES AND INITIALIZATIONS               67 | ||||
| #TOC>   3        ENSEMBL FUNGI                              75 | ||||
| #TOC>   3.1        Import                                   78 | ||||
| #TOC>   4        BLAST SEARCH                              155 | ||||
| #TOC>   4.1        find homologous proteins                161 | ||||
| #TOC>   4.2        Identify species in "hits"              192 | ||||
| #TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282 | ||||
| #TOC>   6        STUDENT NUMBERS                           375 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  The strategy  ======================================================== | ||||
|  | ||||
| # This script will create a list of "MYSPE" species and save it in an R object | ||||
| # MYSPEspecies that is stored in the data subdirectory of this project from | ||||
| # where it can be loaded. The strategy is as follows: we download a list of | ||||
| # annotated fungal genomes from ensembl.fungi. All these are genome-sequenced | ||||
| # species that have been annotated. | ||||
| # Next we perform a BLAST search, to identify fungal species that have | ||||
| # genes that are homologous to yeast MBP1. | ||||
| # | ||||
| # ... | ||||
|  | ||||
| # =    2  PACKAGES AND INITIALIZATIONS  ======================================== | ||||
|  | ||||
| # httr provides interfaces to Webservers on the Internet | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    3  ENSEMBL FUNGI  ======================================================= | ||||
|  | ||||
|  | ||||
| # ==   3.1  Import  ============================================================ | ||||
|  | ||||
| # Navigate to https://fungi.ensembl.org and click on the link to the full | ||||
| # list of all species: https://fungi.ensembl.org/species.html | ||||
| # On the page, click on the spreadsheet symbol top right and choose | ||||
| # "download whole table". The file will be named  "Species.csv", in your | ||||
| # usual downloads folder. Move it to the data folder, and read it. | ||||
|  | ||||
| sDat <- read.csv("./data/Species.csv") | ||||
| str(sDat) | ||||
|  | ||||
| # The most obvious way to partition these is according to Classification ... | ||||
| # (poking around a bit in the UniProt taxonomy database shows that the | ||||
| #  classification used here is the taxonomic rank of "order"). | ||||
| # how many classifications do we have? | ||||
| length(unique(sDat$Classification))  # 66 | ||||
|  | ||||
| # To have a good set for the class, we should have about 100. | ||||
| # Let's see for which of these we can find Mbp1 homologues. | ||||
| # First, we'll keep only the columns for name, classification, and taxID, and | ||||
| # drop the rest ... | ||||
| sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")] | ||||
| colnames(sDat) <- c("name", "order", "taxID") | ||||
|  | ||||
| # Next, we make an extra column: genus - the first part of the binomial name. | ||||
| # We'll use the gsub() function, and for that we need a "regular expression" | ||||
| # that matches to all characters from the first blank to the end of the string: | ||||
| myPatt <- "\\s.*$"  # one whitespace (\\s) ... | ||||
|                     # followed by any character (.) 0..n times (*) ... | ||||
|                     # until the end of the string | ||||
|  | ||||
| # using gsub() we substitute all matching characters with the empty string "" - | ||||
| # this deletes the matching characters | ||||
| # Test this: | ||||
| gsub(myPatt, "", "Genus")                      # one word: unchanged | ||||
| gsub(myPatt, "", "gEnus species")              # two words: return only first | ||||
| gsub(myPatt, "", "geNus species strain 123")   # many words: return only first | ||||
|  | ||||
| # apply this to the "name" column and add the result as a separate column | ||||
| # called "genus" | ||||
| sDat$genus <- gsub(myPatt, "", sDat$name) | ||||
|  | ||||
| # what do we get? | ||||
| c(head(unique(sDat$genus)), | ||||
|   tail(unique(sDat$genus)))  # inspect the first and last few. Note that there | ||||
|                              # is a problem that we have to keep in mind. | ||||
|                              # (Always inspect your results!) | ||||
| # Drop all rows for which the genus contains special characters - | ||||
| # like "[Candida]" | ||||
| sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ] | ||||
|  | ||||
| length(table(sDat$genus))    # how many genus? | ||||
| hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ... | ||||
|                                               # most genus have very few, but | ||||
|                                               # some have very many species. | ||||
| sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten... | ||||
|  | ||||
| # We should have at least one species from each taxonomic order, but we can | ||||
| # add a few genus until we have about 100 validated species. | ||||
|  | ||||
| # Let's add a column for species, by changing our regular expression a bit, | ||||
| # using ^ (start of string), \\S (NOT a whitespace), | ||||
| # and + (one or more matches), capturing the match (...), and returning | ||||
| # it as the substitution (\\1) ... | ||||
|  | ||||
| myPatt <- "^(\\S+\\s\\S+)\\s.*$" | ||||
| sDat$species <- gsub(myPatt, "\\1", sDat$name) | ||||
|  | ||||
| # And we reorder the columns, just for aesthetics: | ||||
| sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")] | ||||
|  | ||||
| # Final check: | ||||
| any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters | ||||
|  | ||||
| # | ||||
| # Now we check which of these have Mbp1 homologues ... | ||||
|  | ||||
| # =    4  BLAST SEARCH  ======================================================== | ||||
|  | ||||
|  | ||||
| # We run a BLAST search to find all proteins related to yeast Mbp1 in any | ||||
| # fungus. With the results, we'll annotate our sDat table. | ||||
|  | ||||
| # ==   4.1  find homologous proteins  ========================================== | ||||
| # | ||||
| # Use BLAST to fetch proteins related to Mbp1 and identify the species that | ||||
| # contain them. | ||||
|  | ||||
| # Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair | ||||
| # amount of error handling involved that is not supported by the API in a | ||||
| # principled way but requires rather ad hoc solutions. The code I threw together | ||||
| # to make a BLAST interface (demo-quality, not research-quality) is in the file | ||||
| # ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty | ||||
| # standard task of communicating with servers and parsing responses - everyday | ||||
| # fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST | ||||
| # parser in currently available packages. | ||||
| # | ||||
| # DON'T use this for BLAST searches unless you have read the NCBI policy | ||||
| # for automated tasks. If you indiscriminately pound on the NCBI's BLAST | ||||
| # server, they will blacklist your IP-address. See: | ||||
| # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo | ||||
| # | ||||
| # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq | ||||
| # BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID | ||||
| #                    db = "refseq_protein",        # database to search in | ||||
| #                    nHits = 3000,                 # 945 hits in 2020 | ||||
| #                    E = 0.01,                     # | ||||
| #                    limits = "txid4751[ORGN]")    # = fungi | ||||
| # saveRDS(BLASThits, file="data/BLASThits.rds") | ||||
| # | ||||
| # NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory | ||||
| # | ||||
| BLASThits <- readRDS(file = "data/BLASThits.rds") | ||||
|  | ||||
| # ==   4.2  Identify species in "hits"  ======================================== | ||||
|  | ||||
| # This is a very big list that can't be usefully analyzed manually. Here | ||||
| # we are only interested in the species names that it contains. | ||||
|  | ||||
| # How many hits in the list? | ||||
| length(BLASThits$hits)      # 1,134 | ||||
|  | ||||
| # Let's look at a hit somewhere down the list. (We spell out the full element | ||||
| # name "hits": `$hit` would only work through partial matching of list names.) | ||||
| str(BLASThits$hits[[277]]) | ||||
|  | ||||
| # A fair amount of parsing has gone into the BLAST.R code to prepare the results | ||||
| # in a useful way. The species information is in the $species element of every | ||||
| # hit. | ||||
|  | ||||
| # Run a loop to extract all the species names into a vector. We subset ... | ||||
| # BLASThits$hits                 ... the list of hits, from which we choose ... | ||||
| # BLASThits$hits[[i]]            ... the i-th hit, and get ... | ||||
| # BLASThits$hits[[i]]$species    ... the species element from that. | ||||
| # Subsetting FTW. | ||||
|  | ||||
| BLASTspecies <- character() | ||||
| for (i in seq_along(BLASThits$hits)) { | ||||
|     BLASTspecies[i] <- BLASThits$hits[[i]]$species | ||||
| } | ||||
|  | ||||
| # You can confirm that BLASTspecies has the expected size. | ||||
| length(BLASTspecies) | ||||
|  | ||||
| # if we delete some of these later on, we still want to remember which hit | ||||
| # they came from. Thus we name() the elements with their index, which is the | ||||
| # same as the index of the hit in BLASThits. seq_along() stays correct even | ||||
| # for an empty vector, where 1:length() would yield c(1, 0). | ||||
| names(BLASTspecies) <- seq_along(BLASTspecies) | ||||
|  | ||||
|  | ||||
| # let's plot the distribution of E-values | ||||
| eVals <- numeric() | ||||
| for (i in seq_along(BLASThits$hits)) { | ||||
|   eVals[i] <- BLASThits$hits[[i]]$E | ||||
| } | ||||
| range(eVals) | ||||
| sum(eVals == 0) | ||||
|  | ||||
| # let's plot the log of all values > 0 to see how they are distributed | ||||
| # plotting only one vector of numbers plots their index as x, and | ||||
| # their value as y ... | ||||
| plot(log(eVals[eVals > 0]), col = "#CC0000") | ||||
|  | ||||
| # This is very informative: I would suspect that the first ten or so are | ||||
| # virtually identical to the yeast protein, then we have about 800 hits with | ||||
| # decreasing similarity, and then about 200 more that may actually be false | ||||
| # positives. Also - we plotted them by index, that means the table is SORTED: | ||||
| # Lower E-values strictly come before higher E-values. | ||||
|  | ||||
| # Again, some species appear more than once, e.g. ... | ||||
| sum(BLASTspecies == "Saccharomyces cerevisiae") | ||||
|  | ||||
| # ... corresponding to the five homologous gene sequences (paralogues) of yeast. | ||||
|  | ||||
| # Therefore we remove duplicates. Removing duplicates will leave the FIRST | ||||
| # in a list alone, and only remove the SUBSEQUENT ones. Which means, from each | ||||
| # species, we will retain only the protein that has the highest similarity | ||||
| # to yeast Mbp1, not any of its more distant paralogues. | ||||
| sel <- ! duplicated(BLASTspecies) | ||||
| BLASTspecies <- BLASTspecies[sel] | ||||
|  | ||||
| length(BLASTspecies) | ||||
| # i.e. we got rid of about two thirds of the hits. | ||||
| tail(BLASTspecies)  # see how the names are useful! | ||||
|                     # again - there are some special characters ... | ||||
|                     # what are they? | ||||
| BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)] | ||||
|  | ||||
| # remove the brackets ... | ||||
| BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies) | ||||
| # drop any new duplicates ... | ||||
| BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)] | ||||
|  | ||||
| # check the number again: | ||||
| length(BLASTspecies) | ||||
| # Think a bit about this: what may be the biological reason to find that | ||||
| # on average, in 388 fungi across the entire phylogenetic tree, we have | ||||
| # three sequences that are homologous to yeast Mbp1? | ||||
|  | ||||
| # Let's look at the distribution of E-values in this selection (Subsetting FTW): | ||||
| # we plot all values that are TRUE in the vector "sel" that we created above, | ||||
| # AND greater than 0 | ||||
| plot(log(eVals[sel & eVals > 0]), col = "#00CC00") | ||||
|  | ||||
|  | ||||
| # =    5  MERGE ENSEMBL AND BLAST RESULTS  ===================================== | ||||
|  | ||||
| # Next we add the blast result to our sDat dataframe. We'll store the index, | ||||
| # the E-value, and the Query-bounds from which we can estimate which domains | ||||
| # of Mbp1 are actually covered by the hit. (True orthologues MUST align with | ||||
| # Mbp1's N-terminal APSES domain.) | ||||
| # | ||||
| # First we pull the hits we wanted from the BLASTspecies: | ||||
| iHits <- as.numeric(names(BLASTspecies)) | ||||
| length(iHits)     # one index for each TRUE in sel | ||||
|  | ||||
| # add columns to sDat | ||||
| l <- nrow(sDat) | ||||
| sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results | ||||
| sDat$eVal   <- numeric(l)  # E-value of the hit | ||||
| sDat$lAli   <- numeric(l)  # length of the aligned region | ||||
|  | ||||
| # extract and merge | ||||
| for (iHit in iHits) { | ||||
|   thisSp <- BLASThits$hits[[iHit]]$species | ||||
|   sel <- sDat$species == thisSp | ||||
|  | ||||
|   sDat$iHit[sel]   <- iHit | ||||
|   sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E | ||||
|   sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli | ||||
| } | ||||
|  | ||||
| # Are all reference species accounted for? | ||||
| selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit | ||||
| REFspecies %in% sDat$species[selA]     # yes, all there | ||||
|  | ||||
| selB <- sDat$species %in% REFspecies   # all rows which have one of REF species | ||||
|  | ||||
| sum(selA & selB)   # How many rows? | ||||
|  | ||||
| # sDat of course includes all duplicates. Some may be multiply sequenced, some | ||||
| # may be different strains. We'll use the same strategy as before and keep | ||||
| # only the best hit: order the rows by E-value, then drop all rows which | ||||
| # are duplicated. | ||||
|  | ||||
|  | ||||
| # drop all rows without BLAST hits ... | ||||
| sDat <- sDat[ ! (sDat$iHit == 0) , ] | ||||
|  | ||||
| # order sDat by E-value ... | ||||
| sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ] | ||||
|  | ||||
| # drop all rows with duplicated species ... | ||||
| sDat <- sDat[ ! duplicated(sDat$species) , ] | ||||
|  | ||||
| # Let's look at the E-values ... | ||||
| plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00") | ||||
|  | ||||
| # and alignment lengths ... | ||||
| plot(sDat$lAli, col = "#00DDAA") | ||||
|  | ||||
| # How many ... | ||||
| length(unique(sDat$name)) | ||||
| length(unique(sDat$species)) | ||||
| length(unique(sDat$genus)) | ||||
| length(unique(sDat$order)) | ||||
|  | ||||
| # I need an extra species for admin purposes later on ... | ||||
| # We select with a logical mask rather than grep() indices: if nothing | ||||
| # matched, sDat[-integer(0), ] would silently drop ALL rows, whereas | ||||
| # sDat[ ! sel, ] leaves sDat unchanged. | ||||
| sel <- grepl("Sporothrix schenckii", sDat$species) | ||||
| SPOSCdat <- sDat[sel, ] | ||||
| sDat <- sDat[ ! sel, ] | ||||
|  | ||||
| # To get the final dataset, we remove the reference species with their | ||||
| # entire orders ... | ||||
| REForders <- unique(sDat$order[sDat$species %in% REFspecies]) | ||||
| sel <- sDat$order %in% REForders | ||||
| REFdat <- sDat[sel , ] | ||||
| sDat   <- sDat[ ! sel , ] | ||||
|  | ||||
| # REFdat should now contain only the REFspecies ... | ||||
| ( REFdat <- REFdat[REFdat$species %in% REFspecies , ] ) | ||||
|  | ||||
| # ... but all of them | ||||
| sum(REFspecies %in% REFdat$species) | ||||
|  | ||||
| # ... and we have enough left in sDat to prune sDat to unique genus | ||||
| sDat <- sDat[ ! duplicated(sDat$genus) , ] | ||||
| nrow(sDat)   # 84 | ||||
|  | ||||
| # I add back "Sporothrix schenckii" ... | ||||
| sDat <- rbind(SPOSCdat, sDat) | ||||
|  | ||||
| # ... and save for future use. | ||||
| # saveRDS(sDat, file = "data/sDat.rds") | ||||
| # saveRDS(REFdat, file = "data/REFdat.rds") | ||||
|  | ||||
|  | ||||
|  | ||||
| # =    6  STUDENT NUMBERS  ===================================================== | ||||
| # | ||||
| # An asymmetric function to retrieve a MYSPE species | ||||
| # | ||||
| sDat <- readRDS(file = "data/sDat.rds") | ||||
|  | ||||
| students <- read.csv("../BCH441-2021-students.csv") | ||||
| sN <- students$Integration.ID | ||||
| sN <- sN[! is.na(sN)] | ||||
| sN <- as.character(sN) | ||||
| sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii" | ||||
|  | ||||
| # Fixed seed: the shuffle of the species and the random keys drawn below are | ||||
| # reproducible. The order of the two sample() calls must not be changed. | ||||
| set.seed(112358) | ||||
| theseSpecies <- sDat[sample(seq_len(nrow(sDat))), ] | ||||
| all(sort(theseSpecies$name) == sort(sDat$name)) | ||||
| nrow(theseSpecies) | ||||
| # Move "Sporothrix schenckii" to the first row. A logical mask is used for | ||||
| # the subsetting because negative indexing with an empty grep() result would | ||||
| # drop every row. | ||||
| isX <- grepl("Sporothrix schenckii", theseSpecies$name) | ||||
| (iX <- which(isX)) | ||||
| theseSpecies <- rbind(theseSpecies[isX, ], theseSpecies[ ! isX, ]) | ||||
| rndMin <-  992000000 | ||||
| rndMax <- 1020000000 | ||||
| N <- 10000 | ||||
| # Draw surplus random keys, discard any that collide with the numbers in sN, | ||||
| # keep N of them, then overwrite the first length(sN) slots with sN itself. | ||||
| keys <- as.character(sample(rndMin:rndMax, N + 1000)) | ||||
| keys <- keys[! (keys %in% sN)] | ||||
| stopifnot(length(keys) >= N, length(sN) <= N)  # otherwise keys would get NAs | ||||
| keys <- keys[seq_len(N)] | ||||
| keys[seq_along(sN)] <- sN | ||||
|  | ||||
| # Recycle the species rows until there are exactly N of them. Indexing with | ||||
| # recycled row numbers gives the same row order as stacking copies with | ||||
| # rbind() in a loop, without re-copying the growing data frame each time. | ||||
| MYSPEdat <- theseSpecies[rep_len(seq_len(nrow(theseSpecies)), N), ] | ||||
| # The row name of row i is the md5 digest of keys[i]. We compute all digests | ||||
| # first and assign the row names in one step. | ||||
| keyHash <- vapply(seq_len(N), | ||||
|                   function(i) digest::digest(keys[i], algo = "md5"), | ||||
|                   character(1)) | ||||
| rownames(MYSPEdat) <- keyHash | ||||
| # set.seed(NULL) re-initializes the RNG as if no seed had been set, so the | ||||
| # final shuffle of the rows is not tied to the fixed seed above. | ||||
| set.seed(NULL) | ||||
| MYSPEdat <- MYSPEdat[sample(seq_len(N)), ] | ||||
|  | ||||
| # saveRDS(MYSPEdat, file = "data/MYSPEdat.rds") | ||||
|  | ||||
| # === validate | ||||
| x <- character() | ||||
| for (n in sN) { | ||||
|   sp <- getMYSPE(n) | ||||
|   if (length(sp) != 1) { | ||||
|     stop(print(as.character(n))) | ||||
|   } else { | ||||
|     x <- c(x, sp) | ||||
|   } | ||||
| } | ||||
|  | ||||
| # === species for late-comers | ||||
| y <- unique(MYSPEdat$species) | ||||
| print(y[!(y %in% x)]) | ||||
|  | ||||
|  | ||||
| # === validate | ||||
| l <- length(sN) | ||||
| sp <- character(l) | ||||
| for(i in 1:l) { | ||||
|   sp[i] <- getMYSPE(sN[i]) | ||||
| } | ||||
| any(duplicated(sp)) | ||||
| length(unique(sp)) | ||||
| which(! sDat$species %in% sp)  # these can be assigned to late-comers | ||||
|  | ||||
| # Done. | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,168 +1,168 @@ | ||||
| # tocID <- "scripts/ABC-makeSTRINGedges.R" | ||||
| # | ||||
| # Create a subnetwork of high-confidence human STRING edges. | ||||
| # | ||||
| # Notes: | ||||
| # | ||||
| #      The large source- datafile is NOT posted to github. If you want to | ||||
| #      experiment with the original data, download it and place it into your | ||||
| #      local  ./data  directory. | ||||
| # | ||||
| #      STRING data source: | ||||
| #        Download page: | ||||
| # https://string-db.org/cgi/download.pl?species_text=Homo+sapiens | ||||
| #        Data: (127.6 Mb) | ||||
| # https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz | ||||
| # | ||||
| # Version:  1.0 | ||||
| # | ||||
| # Date:     2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.0    Rewrite | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                             Line | ||||
| #TOC> ------------------------------------------------- | ||||
| #TOC>   1        Initialize                          44 | ||||
| #TOC>   2        Read STRING Data                    51 | ||||
| #TOC>   3        Define cutoff and subset            63 | ||||
| #TOC>   4        Drop  duplicates                   103 | ||||
| #TOC>   5        Simple statistics                  127 | ||||
| #TOC>   6        Write to file                      160 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Initialize  ========================================================== | ||||
|  | ||||
| if (! requireNamespace("readr", quietly = TRUE)) { | ||||
|   install.packages("readr") | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    2  Read STRING Data  ==================================================== | ||||
|  | ||||
| # Read STRING Data (needs to be downloaded from database, see URL in Notes) | ||||
| # The .gz compressed version is 127.6MB, the uncompressed version is probably | ||||
| # 848 Mb. Fortunately readr:: can read from compressed | ||||
| # files, and does so automatically, based on the file extension. | ||||
| ( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") ) | ||||
| STR <- readr::read_delim(fn, delim = " ") | ||||
| nrow(STR)  #  11,759,454 rows | ||||
| head(STR) | ||||
|  | ||||
|  | ||||
| # =    3  Define cutoff and subset  ============================================ | ||||
|  | ||||
| # approximate distribution of combined_score | ||||
| hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF") | ||||
|  | ||||
| # Let's table the counts >= 850 and plot them for better resolution. | ||||
|  | ||||
| myTb <- table(STR$combined_score[STR$combined_score >= 850]) | ||||
| is.unsorted(as.integer(names(myTb)))  # Good - they are all in order | ||||
|  | ||||
| plot(myTb, type = "b", cex = 0.5, col = "#BB0000") | ||||
| myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that | ||||
|                          # frequently assigns a combined score of 0.900 | ||||
|  | ||||
| # Let's plot these counts as cumulative sums, in reverse order, scaled | ||||
| # as combined scores. | ||||
| myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing | ||||
| plot(myX, | ||||
|      cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing | ||||
|      xlim = c(1.0, 0.85),            # reverse x-axis | ||||
|      type = "l", | ||||
|      main = "STRING interactions for 9606 (top 600,000)", | ||||
|      xlab = "combined_score", | ||||
|      ylab = "cumulative counts", | ||||
|      col = "#CC0000") | ||||
| abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF") | ||||
|  | ||||
| # What's the cutoff for 100,000 edges? | ||||
| which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964 | ||||
|  | ||||
| # confirm | ||||
| sum(STR$combined_score >= 964) # 101,348 | ||||
| abline(v = 0.964, lwd = 0.5, col = "#DDDDFF") | ||||
|  | ||||
| # subset the table, and use only the protein IDs and the combined_score | ||||
| STR <- STR[STR$combined_score >= 964, | ||||
|             c("protein1", "protein2", "combined_score")] | ||||
| colnames(STR) <- c("a", "b", "score") | ||||
|  | ||||
|  | ||||
| # =    4  Drop  duplicates  ==================================================== | ||||
|  | ||||
| # identify duplicate interactions by creating keys in a defined alphabetical | ||||
| # sort order, then checking for  duplicated(). | ||||
| # e.g  if we have (X:U, U:X), we change U:X to X:U and now find that | ||||
| # (X:U, X:U) has a duplicate. | ||||
|  | ||||
| AB <- STR$a < STR$b        # logical vector: genes we need to swap | ||||
| tmp <- STR$b               # copy column b | ||||
| STR$b[AB] <- STR$a[AB]     # copy a's into b | ||||
| STR$a[AB] <- tmp[AB]       # copy tmp's into a | ||||
| all(STR$a >= STR$b)        # confirm: TRUE | ||||
|  | ||||
| # now, make combined keys, like this: | ||||
| paste0(STR$a[1:10], ":", STR$b[1:10]) | ||||
|  | ||||
| tmp <- paste0(STR$a, ":", STR$b) | ||||
| sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports | ||||
|                      # both a:b and b:a ! | ||||
|  | ||||
| # drop all duplicated interactions from tmp | ||||
| STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain | ||||
|  | ||||
|  | ||||
| # =    5  Simple statistics  =================================================== | ||||
|  | ||||
| # how many unique genes? | ||||
| length(unique(c(STR$a, STR$b)))   # 8,445 | ||||
|  | ||||
| # how many self-edges? | ||||
| sum(STR$a == STR$b)  # none | ||||
|  | ||||
| # log(rank) / log(frequency) | ||||
| myTbl <- table(c(STR$a, STR$b)) | ||||
| myTbl <- myTbl[order(myTbl, decreasing = TRUE)] | ||||
|  | ||||
| hist(myTbl, breaks = 40, col = "#FFEEBB") | ||||
|  | ||||
| # number of singletons | ||||
| sum(myTbl == 1) # almost a quarter | ||||
|  | ||||
| # maximum? | ||||
| myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465 | ||||
|                                    # Google: CDC5L | ||||
|  | ||||
| # Zipf-plot | ||||
| plot(log(1:length(myTbl)), log(as.numeric(myTbl)), | ||||
|      type = "b", cex = 0.7, | ||||
|      main = "STRINGedges - degrees", | ||||
|      xlab = "log(rank)", | ||||
|      ylab = "log(frequency)", | ||||
|      col = "#FFBB88") | ||||
|  | ||||
| sprintf("Average number of interactions: %5.2f", | ||||
|          nrow(STR) / length(unique(c(STR$a, STR$b)))) | ||||
|  | ||||
|  | ||||
| # =    6  Write to file  ======================================================= | ||||
|  | ||||
| saveRDS(STR, file = "./data/STRINGedges.rds") | ||||
|  | ||||
| # STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the | ||||
|                                                     # object when needed | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "scripts/ABC-makeSTRINGedges.R" | ||||
| # | ||||
| # Create a subnetwork of high-confidence human STRING edges. | ||||
| # | ||||
| # Notes: | ||||
| # | ||||
| #      The large source- datafile is NOT posted to github. If you want to | ||||
| #      experiment with the original data, download it and place it into your | ||||
| #      local  ./data  directory. | ||||
| # | ||||
| #      STRING data source: | ||||
| #        Download page: | ||||
| # https://string-db.org/cgi/download.pl?species_text=Homo+sapiens | ||||
| #        Data: (127.6 Mb) | ||||
| # https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz | ||||
| # | ||||
| # Version:  1.0 | ||||
| # | ||||
| # Date:     2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.0    Rewrite | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                             Line | ||||
| #TOC> ------------------------------------------------- | ||||
| #TOC>   1        Initialize                          44 | ||||
| #TOC>   2        Read STRING Data                    51 | ||||
| #TOC>   3        Define cutoff and subset            63 | ||||
| #TOC>   4        Drop  duplicates                   103 | ||||
| #TOC>   5        Simple statistics                  127 | ||||
| #TOC>   6        Write to file                      160 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  Initialize  ========================================================== | ||||
|  | ||||
| if (! requireNamespace("readr", quietly = TRUE)) { | ||||
|   install.packages("readr") | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    2  Read STRING Data  ==================================================== | ||||
|  | ||||
| # Read STRING Data (needs to be downloaded from database, see URL in Notes) | ||||
| # The .gz compressed version is 127.6MB, the uncompressed version is probably | ||||
| # 848 Mb. Fortunately readr:: can read from compressed | ||||
| # files, and does so automatically, based on the file extension. | ||||
| ( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") ) | ||||
| STR <- readr::read_delim(fn, delim = " ") | ||||
| nrow(STR)  #  11,759,454 rows | ||||
| head(STR) | ||||
|  | ||||
|  | ||||
| # =    3  Define cutoff and subset  ============================================ | ||||
|  | ||||
| # approximate distribution of combined_score | ||||
| hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF") | ||||
|  | ||||
| # Let's table the counts >= 850 and plot them for better resolution. | ||||
|  | ||||
| myTb <- table(STR$combined_score[STR$combined_score >= 850]) | ||||
| is.unsorted(as.integer(names(myTb)))  # Good - they are all in order | ||||
|  | ||||
| plot(myTb, type = "b", cex = 0.5, col = "#BB0000") | ||||
| myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that | ||||
|                          # frequently assigns a combined score of 0.900 | ||||
|  | ||||
| # Let's plot these counts as cumulative sums, in reverse order, scaled | ||||
| # as combined scores. | ||||
| myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing | ||||
| plot(myX, | ||||
|      cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing | ||||
|      xlim = c(1.0, 0.85),            # reverse x-axis | ||||
|      type = "l", | ||||
|      main = "STRING interactions for 9606 (top 600,000)", | ||||
|      xlab = "combined_score", | ||||
|      ylab = "cumulative counts", | ||||
|      col = "#CC0000") | ||||
| abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF") | ||||
|  | ||||
| # What's the cutoff for 100,000 edges? | ||||
| which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964 | ||||
|  | ||||
| # confirm | ||||
| sum(STR$combined_score >= 964) # 101,348 | ||||
| abline(v = 0.964, lwd = 0.5, col = "#DDDDFF") | ||||
|  | ||||
| # subset the table, and use only the protein IDs and the combined_score | ||||
| STR <- STR[STR$combined_score >= 964, | ||||
|             c("protein1", "protein2", "combined_score")] | ||||
| colnames(STR) <- c("a", "b", "score") | ||||
|  | ||||
|  | ||||
| # =    4  Drop  duplicates  ==================================================== | ||||
|  | ||||
| # identify duplicate interactions by creating keys in a defined alphabetical | ||||
| # sort order, then checking for  duplicated(). | ||||
| # e.g  if we have (X:U, U:X), we change U:X to X:U and now find that | ||||
| # (X:U, X:U) has a duplicate. | ||||
|  | ||||
| AB <- STR$a < STR$b        # logical vector: genes we need to swap | ||||
| tmp <- STR$b               # copy column b | ||||
| STR$b[AB] <- STR$a[AB]     # copy a's into b | ||||
| STR$a[AB] <- tmp[AB]       # copy tmp's into a | ||||
| all(STR$a >= STR$b)        # confirm: TRUE | ||||
|  | ||||
| # now, make combined keys, like this: | ||||
| paste0(STR$a[1:10], ":", STR$b[1:10]) | ||||
|  | ||||
| tmp <- paste0(STR$a, ":", STR$b) | ||||
| sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports | ||||
|                      # both a:b and b:a ! | ||||
|  | ||||
| # drop all duplicated interactions from tmp | ||||
| STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain | ||||
|  | ||||
|  | ||||
| # =    5  Simple statistics  =================================================== | ||||
|  | ||||
| # how many unique genes? | ||||
| length(unique(c(STR$a, STR$b)))   # 8,445 | ||||
|  | ||||
| # how many self-edges? | ||||
| sum(STR$a == STR$b)  # none | ||||
|  | ||||
| # log(rank) / log(frequency) | ||||
| myTbl <- table(c(STR$a, STR$b)) | ||||
| myTbl <- myTbl[order(myTbl, decreasing = TRUE)] | ||||
|  | ||||
| hist(myTbl, breaks = 40, col = "#FFEEBB") | ||||
|  | ||||
| # number of singletons | ||||
| sum(myTbl == 1) # almost a quarter | ||||
|  | ||||
| # maximum? | ||||
| myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465 | ||||
|                                    # Google: CDC5L | ||||
|  | ||||
| # Zipf-plot | ||||
| plot(log(1:length(myTbl)), log(as.numeric(myTbl)), | ||||
|      type = "b", cex = 0.7, | ||||
|      main = "STRINGedges - degrees", | ||||
|      xlab = "log(rank)", | ||||
|      ylab = "log(frequency)", | ||||
|      col = "#FFBB88") | ||||
|  | ||||
| sprintf("Average number of interactions: %5.2f", | ||||
|          nrow(STR) / length(unique(c(STR$a, STR$b)))) | ||||
|  | ||||
|  | ||||
| # =    6  Write to file  ======================================================= | ||||
|  | ||||
| saveRDS(STR, file = "./data/STRINGedges.rds") | ||||
|  | ||||
| # STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the | ||||
|                                                     # object when needed | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,167 +1,167 @@ | ||||
| # tocID <- "scripts/ABC-makeScCCnet.R" | ||||
| # | ||||
| # Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle" | ||||
| # GOSlim annotation. | ||||
| # | ||||
| # Boris Steipe for ABC learning units | ||||
| # | ||||
| # Notes: | ||||
| # | ||||
| #      The large source- datafiles are NOT posted to github. If you want to | ||||
| #      experiment with your own code, download them and place them into your | ||||
| #      local  ./data  directory. | ||||
| # | ||||
| #      STRING data source: | ||||
| #        Download page: | ||||
| # https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae | ||||
| #        Data: (20.1 mb) | ||||
| # https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz | ||||
| # | ||||
| #      GOSlim data source: (Note: this has moved from GO to SGD) | ||||
| #        Info page: https://www.yeastgenome.org/downloads | ||||
| #        Info page: http://sgd-archive.yeastgenome.org/curation/literature/ | ||||
| #        Data: (3 mb) | ||||
| # http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab | ||||
| # | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Update. GO Slim Yeast now at SGD | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0    First code copied from 2016 material. | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # ============================================================================== | ||||
| # SRCDIR <- "./instructor" | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                           Line | ||||
| #TOC> --------------------------------------------------------------- | ||||
| #TOC>   1        INITIALIZE                                        58 | ||||
| #TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66 | ||||
| #TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96 | ||||
| #TOC>   3.1        Intersect interactions and annotations         122 | ||||
| #TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  INITIALIZE  ========================================================== | ||||
|  | ||||
| SRCDIR <- "./data" | ||||
| if (! requireNamespace("readr", quietly = TRUE)) { | ||||
|   install.packages("readr") | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    2  STRING FUNCTIONAL INTERACTION DATA  ================================== | ||||
|  | ||||
| # Read STRING Data (needs to be downloaded from database, see URL in Notes) | ||||
| # The .gz compressed version is 20MB, the uncompressed version is 110MB - | ||||
| # really not necessary to uncompress since readr:: can read from compressed | ||||
| # files, and does so automatically, based on the file extension. | ||||
| ( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") ) | ||||
| STR <- readr::read_delim(fn, delim = " ") | ||||
|  | ||||
| # Subset only IDs and combined_score column | ||||
| STR <- STR[ , c("protein1", "protein2", "combined_score")] | ||||
|  | ||||
| # head(STR) | ||||
| # sum(STR$combined_score > 909)  # 100270 edges | ||||
| # subset for 100,000 highest confidence edges | ||||
| STR <- STR[(STR$combined_score > 909), ] | ||||
| head(STR) | ||||
|  | ||||
| # IDs are formatted like 4932.YAL005C ... drop the "4932." prefix | ||||
| STR$protein1 <- gsub("^4932\\.", "", STR$protein1) | ||||
| STR$protein2 <- gsub("^4932\\.", "", STR$protein2) | ||||
| head(STR) | ||||
|  | ||||
| # get a vector of gene names in this list | ||||
| myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene | ||||
|                                                       # names | ||||
| length(myIntxGenes) | ||||
| sample(myIntxGenes, 10)  # choose 10 at random (sanity check) | ||||
|  | ||||
|  | ||||
| # =    3  GOSlim FUNCTIONAL ANNOTATIONS  ======================================= | ||||
| # | ||||
| # Read GOSlim data  (needs to be downloaded from database, see URL in Notes) | ||||
| ( fn <- file.path(SRCDIR, "go_slim_mapping.tab") ) | ||||
|  | ||||
| Gsl <- readr::read_tsv(fn, | ||||
|                        col_names = c("ID", | ||||
|                                      "name", | ||||
|                                      "SGDId", | ||||
|                                      "Ontology", | ||||
|                                      "termName", | ||||
|                                      "termID", | ||||
|                                      "status")) | ||||
|  | ||||
| head(Gsl) | ||||
|  | ||||
| # What cell cycle names does it contain? | ||||
| myGslTermNames <- unique(Gsl$termName)  # 169 unique terms | ||||
| myGslTermNames[grep("cycle", myGslTermNames)] | ||||
| # [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle" | ||||
|  | ||||
| # Choose "mitotic cell cycle" as the GOslim term to subset with | ||||
|  | ||||
| scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"]) | ||||
| length(scCCgenes)  # 324 genes annotated to that term | ||||
|  | ||||
| # ==   3.1  Intersect interactions and annotations  ============================ | ||||
|  | ||||
| sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence | ||||
| #                                # functional interactions | ||||
|  | ||||
|  | ||||
| # =    4  DEFINE THE CELL-CYCLE NETWORK  ======================================= | ||||
| # | ||||
| # Define scCCnet ... the S. cerevisiae Cell Cycle network | ||||
| # Subset all rows for which BOTH genes are in the GOslim cell cycle set | ||||
| # | ||||
| scCCnet <- STR[(STR$protein1 %in% scCCgenes) & | ||||
|                (STR$protein2 %in% scCCgenes), ] | ||||
|  | ||||
| # How many genes are there? | ||||
| length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283 | ||||
|  | ||||
| # Each edge is listed twice - now remove duplicates. | ||||
|  | ||||
| # Step 1: make a vector: sort two names so the first one is alphabetically | ||||
| #         smaller than the second one. This brings the two names into a defined | ||||
| #         order. Then concatenate them with a "." - the resulting string | ||||
| #         is always the same, for any order. E.g. c("A", "B") gives "A.B" | ||||
| #         and c("B", "A") also gives "A.B". This identifies duplicates. | ||||
|  | ||||
| x <- apply(cbind(scCCnet$protein1, scCCnet$protein2), | ||||
|            1, | ||||
|            FUN = function(x) { return(paste(sort(x), collapse = ".")) }) | ||||
| head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc. | ||||
|  | ||||
| sum(duplicated(x))  # 1453 | ||||
|  | ||||
| # Step 2: drop all rows that contain duplicates in x | ||||
| scCCnet <- scCCnet[! duplicated(x), ] | ||||
|  | ||||
| # Confirm we didn't lose genes | ||||
| length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change | ||||
| nrow(scCCnet) | ||||
| # Network has 283 nodes, 1453 edges | ||||
|  | ||||
| saveRDS(scCCnet, file = "./data/scCCnet.rds") | ||||
|  | ||||
| # scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the | ||||
|                                              #      object when needed | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "scripts/ABC-makeScCCnet.R" | ||||
| # | ||||
| # Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle" | ||||
| # GOSlim annotation. | ||||
| # | ||||
| # Boris Steipe for ABC learning units | ||||
| # | ||||
| # Notes: | ||||
| # | ||||
| #      The large source- datafiles are NOT posted to github. If you want to | ||||
| #      experiment with your own code, download them and place them into your | ||||
| #      local  ./data  directory. | ||||
| # | ||||
| #      STRING data source: | ||||
| #        Download page: | ||||
| # https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae | ||||
| #        Data: (20.1 mb) | ||||
| # https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz | ||||
| # | ||||
| #      GOSlim data source: (Note: this has moved from GO to SGD) | ||||
| #        Info page: https://www.yeastgenome.org/downloads | ||||
| #        Info page: http://sgd-archive.yeastgenome.org/curation/literature/ | ||||
| #        Data: (3 mb) | ||||
| # http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab | ||||
| # | ||||
| # | ||||
| # Version:  1.2 | ||||
| # | ||||
| # Date:     2017-10  -  2020-09 | ||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||
| # | ||||
| # Versions: | ||||
| #           1.2    2020 Update. GO Slim Yeast now at SGD | ||||
| #           1.1    Change from require() to requireNamespace(), | ||||
| #                      use <package>::<function>() idiom throughout | ||||
| #           1.0    First code copied from 2016 material. | ||||
| # | ||||
| # TODO: | ||||
| # | ||||
| # ============================================================================== | ||||
| # SRCDIR <- "./instructor" | ||||
|  | ||||
|  | ||||
| #TOC> ========================================================================== | ||||
| #TOC>  | ||||
| #TOC>   Section  Title                                           Line | ||||
| #TOC> --------------------------------------------------------------- | ||||
| #TOC>   1        INITIALIZE                                        58 | ||||
| #TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66 | ||||
| #TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96 | ||||
| #TOC>   3.1        Intersect interactions and annotations         122 | ||||
| #TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128 | ||||
| #TOC>  | ||||
| #TOC> ========================================================================== | ||||
|  | ||||
|  | ||||
| # =    1  INITIALIZE  ========================================================== | ||||
|  | ||||
| SRCDIR <- "./data" | ||||
| if (! requireNamespace("readr", quietly = TRUE)) { | ||||
|   install.packages("readr") | ||||
| } | ||||
|  | ||||
|  | ||||
| # =    2  STRING FUNCTIONAL INTERACTION DATA  ================================== | ||||
|  | ||||
| # Read STRING Data (needs to be downloaded from database, see URL in Notes) | ||||
| # The .gz compressed version is 20MB, the uncompressed version is 110MB - | ||||
| # really not necessary to uncompress since readr:: can read from compressed | ||||
| # files, and does so automatically, based on the file extension. | ||||
| ( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") ) | ||||
| STR <- readr::read_delim(fn, delim = " ") | ||||
|  | ||||
| # Subset only IDs and combined_score column | ||||
| STR <- STR[ , c("protein1", "protein2", "combined_score")] | ||||
|  | ||||
| # head(STR) | ||||
| # sum(STR$combined_score > 909)  # 100270 edges | ||||
| # subset for 100,000 highest confidence edges | ||||
| STR <- STR[(STR$combined_score > 909), ] | ||||
| head(STR) | ||||
|  | ||||
| # IDs are formatted like 4932.YAL005C ... drop the "4932." prefix | ||||
| STR$protein1 <- gsub("^4932\\.", "", STR$protein1) | ||||
| STR$protein2 <- gsub("^4932\\.", "", STR$protein2) | ||||
| head(STR) | ||||
|  | ||||
| # get a vector of gene names in this list | ||||
| myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene | ||||
|                                                       # names | ||||
| length(myIntxGenes) | ||||
| sample(myIntxGenes, 10)  # choose 10 at random (sanity check) | ||||
|  | ||||
|  | ||||
| # =    3  GOSlim FUNCTIONAL ANNOTATIONS  ======================================= | ||||
| # | ||||
| # Read GOSlim data  (needs to be downloaded from database, see URL in Notes) | ||||
| ( fn <- file.path(SRCDIR, "go_slim_mapping.tab") ) | ||||
|  | ||||
| Gsl <- readr::read_tsv(fn, | ||||
|                        col_names = c("ID", | ||||
|                                      "name", | ||||
|                                      "SGDId", | ||||
|                                      "Ontology", | ||||
|                                      "termName", | ||||
|                                      "termID", | ||||
|                                      "status")) | ||||
|  | ||||
| head(Gsl) | ||||
|  | ||||
| # What cell cycle names does it contain? | ||||
| myGslTermNames <- unique(Gsl$termName)  # 169 unique terms | ||||
| myGslTermNames[grep("cycle", myGslTermNames)] | ||||
| # [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle" | ||||
|  | ||||
| # Choose "mitotic cell cycle" as the GOslim term to subset with | ||||
|  | ||||
| scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"]) | ||||
| length(scCCgenes)  # 324 genes annotated to that term | ||||
|  | ||||
| # ==   3.1  Intersect interactions and annotations  ============================ | ||||
|  | ||||
| sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence | ||||
| #                                # functional interactions | ||||
|  | ||||
|  | ||||
| # =    4  DEFINE THE CELL-CYCLE NETWORK  ======================================= | ||||
| # | ||||
| # Define scCCnet ... the S. cerevisiae Cell Cycle network | ||||
| # Subset all rows for which BOTH genes are in the GOslim cell cycle set | ||||
| # | ||||
| scCCnet <- STR[(STR$protein1 %in% scCCgenes) & | ||||
|                (STR$protein2 %in% scCCgenes), ] | ||||
|  | ||||
| # How many genes are there? | ||||
| length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283 | ||||
|  | ||||
| # Each edge is listed twice - now remove duplicates. | ||||
|  | ||||
| # Step 1: make a vector: sort two names so the first one is alphabetically | ||||
| #         smaller than the second one. This brings the two names into a defined | ||||
| #         order. Then concatenate them with a "." - the resulting string | ||||
| #         is always the same, for any order. E.g. c("A", "B") gives "A.B" | ||||
| #         and c("B", "A") also gives "A.B". This identifies duplicates. | ||||
|  | ||||
| x <- apply(cbind(scCCnet$protein1, scCCnet$protein2), | ||||
|            1, | ||||
|            FUN = function(x) { return(paste(sort(x), collapse = ".")) }) | ||||
| head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc. | ||||
|  | ||||
| sum(duplicated(x))  # 1453 | ||||
|  | ||||
| # Step 2: drop all rows that contain duplicates in x | ||||
| scCCnet <- scCCnet[! duplicated(x), ] | ||||
|  | ||||
| # Confirm we didn't lose genes | ||||
| length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change | ||||
| nrow(scCCnet) | ||||
| # Network has 283 nodes, 1453 edges | ||||
|  | ||||
| saveRDS(scCCnet, file = "./data/scCCnet.rds") | ||||
|  | ||||
| # scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the | ||||
|                                              #      object when needed | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,135 +1,135 @@ | ||||
| # tocID <- "scripts/ABC-writeALN.R" | ||||
| # | ||||
| # ToDo:    calculate consensus line | ||||
| #          append sequence numbers | ||||
| # Notes: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| writeALN <- function(ali, | ||||
|                      range, | ||||
|                      note = "", | ||||
|                      myCon = stdout(), | ||||
|                      blockWidth = 60) { | ||||
|   # Purpose: | ||||
|   #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or | ||||
|   #     a file in multi-FASTA format. | ||||
|   # Version: 2.0 | ||||
|   # Date:    2017 10 | ||||
|   # Author:  Boris Steipe | ||||
|   # | ||||
|   # Parameters: | ||||
|   #     ali             MsaAAMultipleAlignment or AAStringSet or character | ||||
|   #                       vector. | ||||
|   #     range      num  a two-integer vector of start and end positions if | ||||
|   #                       only a range of the MSA should be written, e.g. | ||||
|   #                       a domain. Defaults to the full alignment length. | ||||
|   #     note       chr  a vector of character that is appended to the name | ||||
|   #                       of a sequence in the FASTA header. Recycling of | ||||
|   #                       shorter vectors applies, thus a vector of length one | ||||
|   #                       is added to all headers. | ||||
|   #     myCon           a connection (cf. the con argument for writeLines). | ||||
|   #                       Defaults to stdout() | ||||
|   #     blockWidth int  width of sequence block. Default 60 characters. | ||||
|   # Value: | ||||
|   #     NA   the function is invoked for its side effect of printing an | ||||
|   #          alignment to stdout() or file. | ||||
|  | ||||
|   blockWidth <- as.integer(blockWidth) | ||||
|   if (is.na(blockWidth)) { | ||||
|     stop("PANIC: parameter \"blockWidth\" must be numeric.") | ||||
|   } | ||||
|   if (blockWidth < 1) { | ||||
|     stop("PANIC: parameter \"blockWidth\" must be greater than zero.") | ||||
|   } | ||||
|   if (blockWidth > 60) { | ||||
|     warning("Programs that read CLUSTAL format might not expect blockWidth > 60.") | ||||
|   } | ||||
|  | ||||
|   # Extract the raw data from the objects depending on their respective class | ||||
|   # and put it into a named vector of strings. | ||||
|  | ||||
|   # Extract XStringSet from MsaXMultipleAlignment ... | ||||
|   if (class(ali) == "MsaAAMultipleAlignment" | | ||||
|       class(ali) == "MsaDNAMultipleAlignment" | | ||||
|       class(ali) == "MsaRNAMultipleAlignment") { | ||||
|       ali <- ali@unmasked | ||||
|   } | ||||
|  | ||||
|   # Process XStringSet | ||||
|   if (class(ali) == "AAStringSet" | | ||||
|       class(ali) == "DNAStringSet" | | ||||
|       class(ali) == "RNAStringSet") { | ||||
|     sSet <- as.character(ali) # we use as.character(), not toString() thus | ||||
|                               # we don't _have_ to load Biostrings | ||||
|   } else if (class(ali) == "character") { | ||||
|     sSet <- ali | ||||
|   } else { | ||||
|     stop(paste("Input object of class", | ||||
|                class(ali), | ||||
|                "can't be handled by this function.")) | ||||
|   } | ||||
|  | ||||
|   if (missing(range)) { | ||||
|     range <- 1 | ||||
|     range[2] <- max(nchar(sSet)) | ||||
|   } else { | ||||
|     range <- as.integer(range) | ||||
|     if(length(range) != 2 || | ||||
|        any(is.na(range)) || | ||||
|        range[1] > range[2] || | ||||
|        range[1] < 1) { | ||||
|       stop("PANIC: \"range\" parameter must contain valid start and end index.") | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   # Right-pad any sequence with "-" that is shorter than ranges[2] | ||||
|     for (i in seq_along(sSet)) { | ||||
|       if (nchar(sSet[i]) < range[2]) { | ||||
|         sSet[i] <- paste0(sSet[i], | ||||
|                           paste0(rep("-", range[2] - nchar(sSet[i])), | ||||
|                                  collapse = "")) | ||||
|       } | ||||
|     } | ||||
|  | ||||
|   # Right-pad sequence names | ||||
|   sNames <- names(sSet) | ||||
|   len <- max(nchar(sNames)) + 2 # longest name plus two spaces | ||||
|   for (i in seq_along(sNames)) { | ||||
|     sNames[i] <- paste0(sNames[i], | ||||
|                       paste0(rep(" ", len - nchar(sNames[i])), | ||||
|                              collapse = "")) | ||||
|   } | ||||
|  | ||||
|  | ||||
|   # Process each sequence | ||||
|   txt <- paste0("CLUSTAL W format. ", note) | ||||
|   txt[2] <- "" | ||||
|  | ||||
|   iStarts <- seq(range[1], range[2], by = blockWidth) | ||||
|   iEnds <- c((iStarts[-1] - 1), range[2]) | ||||
|  | ||||
|   for (i in seq_along(iStarts)) { | ||||
|     for (j in seq_along(sSet)) { | ||||
|       txt <- c(txt, | ||||
|                paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i]))) | ||||
|     } | ||||
|     txt <- c(txt, "")  # append a blank consenus line | ||||
|     txt <- c(txt, "")  # append a separator line | ||||
|   } | ||||
|  | ||||
|   writeLines(txt, con= myCon) | ||||
|  | ||||
| } | ||||
|  | ||||
| # ====  TESTS  ================================================================= | ||||
| # Enter your function tests here... | ||||
|  | ||||
| if (FALSE) { | ||||
|   # test ... | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # tocID <- "scripts/ABC-writeALN.R" | ||||
| # | ||||
| # ToDo:    calculate consensus line | ||||
| #          append sequence numbers | ||||
| # Notes: | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write a multiple sequence alignment to stdout() or a file in
  #     CLUSTAL W format (header line, then blocks of name-prefixed rows).
  # Version: 2.0
  # Date:    2017 10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             MsaAAMultipleAlignment, MsaDNAMultipleAlignment,
  #                       MsaRNAMultipleAlignment, AAStringSet, DNAStringSet,
  #                       RNAStringSet, or a character vector. Sequence names
  #                       are taken from names(); sequences without a name
  #                       are labelled "seq<i>" (i: position in ali).
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full alignment length.
  #                       Sequences shorter than range[2] are right-padded
  #                       with "-".
  #     note       chr  text that is appended to the "CLUSTAL W format. "
  #                       header line. A vector with more than one element
  #                       is collapsed into one string, separated by blanks.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 60 characters. A
  #                       warning is issued for larger values.
  # Value:
  #     NULL (invisibly). The function is invoked for its side effect of
  #     printing an alignment to stdout() or file.

  blockWidth <- suppressWarnings(as.integer(blockWidth))
  if (length(blockWidth) != 1) {
    stop("PANIC: parameter \"blockWidth\" must be a single number.")
  }
  if (is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a (named) vector of strings. inherits() is used rather
  # than comparing class() with "==", since class() may return more than one
  # element and if() requires a condition of length one.

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (is.character(ali)) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
    if (is.na(range[2]) || range[2] < 1) {
      stop("PANIC: all input sequences are empty.")
    }
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Fetch the names before padding: paste0() does not carry names over.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- character(length(sSet))
  }
  noName <- is.na(sNames) | !nzchar(sNames)
  sNames[noName] <- paste0("seq", which(noName))

  # Right-pad any sequence with "-" that is shorter than range[2]
  sSet <- paste0(sSet, strrep("-", pmax(range[2] - nchar(sSet), 0L)))

  # Right-pad sequence names to a common width
  len <- max(nchar(sNames)) + 2 # longest name plus two spaces
  sNames <- paste0(sNames, strrep(" ", len - nchar(sNames)))

  # Header line, followed by one empty line
  header <- paste0("CLUSTAL W format. ", paste(note, collapse = " "))

  # Start and end index of every block of at most blockWidth columns
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # Each block: one row per sequence, a blank consensus line, and a
  # separator line.
  blocks <- lapply(seq_along(iStarts), function(i) {
    c(paste0(sNames, substring(sSet, iStarts[i], iEnds[i])), "", "")
  })

  writeLines(c(header, "", unlist(blocks, use.names = FALSE)), con = myCon)

  invisible(NULL)
}
|  | ||||
| # ====  TESTS  ================================================================= | ||||
| # Enter your function tests here... | ||||
|  | ||||
| if (FALSE) { | ||||
|   # test ... | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
| @@ -1,121 +1,121 @@ | ||||
| # ABC-writeMFA.R | ||||
| # | ||||
| # ToDo: | ||||
| # Notes:  2.1  bugfix: empty notes caused superfluous blank after header. | ||||
| # | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write a set of sequences or a multiple sequence alignment to
  #     stdout() or a file in multi-FASTA format.
  # Version: 2.1
  # Date:    2017  10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             MsaAAMultipleAlignment, MsaDNAMultipleAlignment,
  #                       MsaRNAMultipleAlignment, AAStringSet, DNAStringSet,
  #                       RNAStringSet, or a character vector. Sequence names
  #                       are taken from names(); sequences without a name
  #                       are labelled "seq<i>" (i: position in ali).
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full sequence length.
  #     note       chr  a vector of character that is appended to the name
  #                       of a sequence in the FASTA header, separated by a
  #                       blank. Recycling of shorter vectors applies, thus
  #                       a vector of length one is added to all headers.
  #                       Empty ("") or NA elements add nothing.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisibly). The function is invoked for its side effect of
  #     printing the sequences to stdout() or file.

  blockWidth <- suppressWarnings(as.integer(blockWidth))
  if (length(blockWidth) != 1) {
    stop("PANIC: parameter \"blockWidth\" must be a single number.")
  }
  if (is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a (named) vector of strings. inherits() is used rather
  # than comparing class() with "==", since class() may return more than one
  # element and if() requires a condition of length one.

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (is.character(ali)) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
    if (is.na(range[2]) || range[2] < 1) {
      stop("PANIC: all input sequences are empty.")
    }
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Construct one header per sequence. The note is recycled over the
  # sequences and is only appended where it is not empty, so that an empty
  # note does not leave a superfluous blank after the name.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- character(length(sSet))
  }
  noName <- is.na(sNames) | !nzchar(sNames)
  sNames[noName] <- paste0("seq", which(noName))

  note <- as.character(note)
  if (length(note) == 0) {
    note <- ""
  }
  note <- rep_len(note, length(sSet))
  hasNote <- !is.na(note) & nzchar(note)

  headers <- sNames
  headers[hasNote] <- paste(sNames[hasNote], note[hasNote])

  # Start and end index of every block of at most blockWidth characters;
  # these are the same for all sequences.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # Each record: FASTA header, the non-empty sequence blocks, and an empty
  # line for readability.
  records <- lapply(seq_along(sSet), function(i) {
    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
    thisSeq <- thisSeq[nzchar(thisSeq)]            # drop empty blocks
    c(paste0(">", headers[i]), thisSeq, "")
  })

  writeLines(unlist(records, use.names = FALSE), con = myCon)

  invisible(NULL)
}
|  | ||||
| # ====  TESTS  ================================================================= | ||||
| # Enter your function tests here... | ||||
|  | ||||
| if (FALSE) { | ||||
|   # test ... | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
| # ABC-writeMFA.R | ||||
| # | ||||
| # ToDo: | ||||
| # Notes:  2.1  bugfix: empty notes caused superfluous blank after header. | ||||
| # | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write a set of sequences or a multiple sequence alignment to
  #     stdout() or a file in multi-FASTA format.
  # Version: 2.1
  # Date:    2017  10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             MsaAAMultipleAlignment, MsaDNAMultipleAlignment,
  #                       MsaRNAMultipleAlignment, AAStringSet, DNAStringSet,
  #                       RNAStringSet, or a character vector. Sequence names
  #                       are taken from names(); sequences without a name
  #                       are labelled "seq<i>" (i: position in ali).
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full sequence length.
  #     note       chr  a vector of character that is appended to the name
  #                       of a sequence in the FASTA header, separated by a
  #                       blank. Recycling of shorter vectors applies, thus
  #                       a vector of length one is added to all headers.
  #                       Empty ("") or NA elements add nothing.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisibly). The function is invoked for its side effect of
  #     printing the sequences to stdout() or file.

  blockWidth <- suppressWarnings(as.integer(blockWidth))
  if (length(blockWidth) != 1) {
    stop("PANIC: parameter \"blockWidth\" must be a single number.")
  }
  if (is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a (named) vector of strings. inherits() is used rather
  # than comparing class() with "==", since class() may return more than one
  # element and if() requires a condition of length one.

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (is.character(ali)) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
    if (is.na(range[2]) || range[2] < 1) {
      stop("PANIC: all input sequences are empty.")
    }
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Construct one header per sequence. The note is recycled over the
  # sequences and is only appended where it is not empty, so that an empty
  # note does not leave a superfluous blank after the name.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- character(length(sSet))
  }
  noName <- is.na(sNames) | !nzchar(sNames)
  sNames[noName] <- paste0("seq", which(noName))

  note <- as.character(note)
  if (length(note) == 0) {
    note <- ""
  }
  note <- rep_len(note, length(sSet))
  hasNote <- !is.na(note) & nzchar(note)

  headers <- sNames
  headers[hasNote] <- paste(sNames[hasNote], note[hasNote])

  # Start and end index of every block of at most blockWidth characters;
  # these are the same for all sequences.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # Each record: FASTA header, the non-empty sequence blocks, and an empty
  # line for readability.
  records <- lapply(seq_along(sSet), function(i) {
    thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks
    thisSeq <- thisSeq[nzchar(thisSeq)]            # drop empty blocks
    c(paste0(">", headers[i]), thisSeq, "")
  })

  writeLines(unlist(records, use.names = FALSE), con = myCon)

  invisible(NULL)
}
|  | ||||
| # ====  TESTS  ================================================================= | ||||
| # Enter your function tests here... | ||||
|  | ||||
| if (FALSE) { | ||||
|   # test ... | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| # [END] | ||||
|   | ||||
							
								
								
									
										768
									
								
								scripts/BLAST.R
									
									
									
									
									
								
							
							
						
						
									
										768
									
								
								scripts/BLAST.R
									
									
									
									
									
								
							| @@ -1,384 +1,384 @@ | ||||
| # BLAST.R | ||||
| # | ||||
| # Purpose: Send off one BLAST search and return parsed list of results | ||||
| #          This script uses the BLAST URL-API | ||||
| #          (Application Programming Interface) at the NCBI. | ||||
| #          Read about the constraints here: | ||||
| #          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo | ||||
| # | ||||
| # | ||||
| # Version: 3.2 | ||||
| # Date:    2016 09 - 2020 09 | ||||
| # Author:  Boris Steipe | ||||
| # | ||||
| # Versions: | ||||
| #    3.2   2020 updates | ||||
| #    3.1   Change from require() to requireNamespace(), | ||||
| #          use <package>::<function>() idiom throughout | ||||
| #    3.0   parsing logic had not been fully implemented; Fixed. | ||||
| #    2.1   bugfix in BLAST(), bug was blanking non-split deflines; | ||||
| #          refactored parseBLASTalignment() to handle lists with multiple hits. | ||||
| #    2.0   Completely rewritten because the interface completely changed. | ||||
| #          Code adapted in part from NCBI Perl sample code: | ||||
| #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $ | ||||
| #    1.0   first version posted for BCH441 2016, based on BLAST - API | ||||
| # | ||||
| # ToDo:    Return the organism/strain name in the output, and propagate | ||||
| #          into MYSPE selection script. | ||||
| # | ||||
| # Notes:   This is somewhat pedestrian, but apparently there are currently | ||||
| #          no R packages that contain such code. | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
| if (! requireNamespace("httr", quietly = TRUE)) { | ||||
|   install.packages("httr") | ||||
| } | ||||
|  | ||||
|  | ||||
BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
  # Purpose:
  #     Basic BLAST (blastp) search via the NCBI BLAST URL-API: submit the
  #     query (unless a request ID is supplied), poll until the search has
  #     finished, retrieve the text report, and parse the alignments.
  #
  # Parameters:
  #     Q: query - either a valid ID or a sequence. Only used if rid is "".
  #     db: "refseq_protein" by default,
  #         other legal values include: "nr", "pdb", "swissprot" ...
  #     nHits: number of hits to maximally return
  #     E: E-value cutoff. Do not return hits whose score would be expected
  #        to occur E or more times in a database of random sequence.
  #     limits: a valid ENTREZ filter
  #     rid: a request ID - to retrieve earlier search results
  #     query: the actual query string (needed when retrieving results
  #            with an rid)
  #     quietly: controls printing of wait-time progress bar
  #     myTimeout: how much longer _after_ rtoe to wait for a result
  #                before giving up (seconds)
  # Value:
  #     result: list of process status or resulting hits, and some metadata:
  #             $query  the submitted URL (or the query argument, if an rid
  #                     was supplied)
  #             $rid    the request ID
  #             $rtoe   the completion time estimate reported by the server
  #                     (0 if an rid was supplied)
  #             $hits   list with one parsed alignment per hit; only present
  #                     if the search finished and returned hits.

  BLASTURL <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
  EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done

  # Abort with the server's message if a request did not succeed.
  checkResponse <- function(response, action) {
    if (httr::http_status(response)$category != "Success") {
      stop(sprintf("PANIC: Can't %s. BLAST server status error: %s",
                   action,
                   httr::http_status(response)$message))
    }
  }

  results <- list()
  results$query <- query
  results$rid <- rid
  results$rtoe <- 0

  if (rid == "") {  # If no rid is available, spawn a search.
                    # Else, proceed directly to retrieval.

    # prepare query, GET(), and parse rid and rtoe from BLAST server response
    results$query <- paste0(BLASTURL,
                            "?",
                            "CMD=Put",
                            "&PROGRAM=", "blastp",
                            "&QUERY=", URLencode(Q),
                            "&DATABASE=", db,
                            "&MATRIX=", "BLOSUM62",
                            "&EXPECT=", as.character(E),
                            "&HITLIST_SIZE=", as.character(nHits),
                            "&ALIGNMENTS=", as.character(nHits),
                            "&FORMAT_TYPE=Text")

    if (limits != "") {
      results$query <- paste0(results$query,
                              "&ENTREZ_QUERY=", limits)
    }

    # send it off ...
    response <- httr::GET(results$query)
    checkResponse(response, "send query")

    txt <- httr::content(response, "text", encoding = "UTF-8")

    patt <- "RID = (\\w+)" # match the request id
    results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]

    patt <- "RTOE = (\\d+)" # match the expected completion time
    results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])

    # Without an rid there is nothing to poll for, and without rtoe the
    # wait below can not be timed: fail with an explicit message.
    if (is.na(results$rid) || is.na(results$rtoe)) {
      stop("PANIC: Can't parse RID and RTOE from the BLAST server response.")
    }

    # Now we wait ...
    if (quietly) {
      Sys.sleep(results$rtoe)
    } else {
      cat(sprintf("BLAST is processing %s:\n", results$rid))
      waitTimer(results$rtoe)
    }

  } # done sending query and retrieving rid, rtoe

  # Poll for result availability until the search has finished, has failed,
  # or the timeout is used up.
  checkStatus <- paste0(BLASTURL,
                        "?",
                        "CMD=Get",
                        "&RID=", results$rid,
                        "&FORMAT_TYPE=Text",
                        "&FORMAT_OBJECT=SearchInfo")

  repeat {
    # Check whether the result is ready
    response <- httr::GET(checkStatus)
    checkResponse(response, "check status")

    txt <- httr::content(response, "text", encoding = "UTF-8")
    hasTag <- function(tag) {
      any(grepl(tag, txt, fixed = TRUE))
    }

    isWaiting <- hasTag("Status=WAITING")

    if (! isWaiting) {
      if (hasTag("Status=FAILED")) {
        cat("BLAST search returned status \"FAILED\". Aborting.\n")
        return(results)
      }

      if (hasTag("Status=UNKNOWN")) {
        cat("BLAST search returned status \"UNKNOWN\".\n")
        cat("This probably means the rid has expired. Aborting.\n")
        return(results)
      }

      if (hasTag("Status=READY")) {  # Done
        if (! hasTag("ThereAreHits=yes")) {  # No hits
          cat("BLAST search ready but no hits found. Aborting.\n")
          return(results)
        }
        break  # done ... retrieve search result
      }
    }

    # Still waiting - or the response carried none of the known status
    # tags. Both cases use up timeout and pause before the next request, so
    # that the loop always terminates and never polls without a delay.
    myTimeout <- myTimeout - EXTRAWAIT

    if (myTimeout <= 0) { # abort
      cat("BLAST search not concluded before timeout. Aborting.\n")
      cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
                  "Trying checking back later with >",
                  results$rid))
      return(results)
    }

    if (quietly) {
      Sys.sleep(EXTRAWAIT)
    } else {
      cat(sprintf("Status: %s. Wait %d more seconds (max. %d more)",
                  if (isWaiting) "Waiting" else "Not recognized",
                  EXTRAWAIT,
                  myTimeout))
      waitTimer(EXTRAWAIT)
    }
  } # end result-check loop

  # retrieve results from BLAST server
  retrieve <- paste0(BLASTURL,
                     "?",
                     "&CMD=Get",
                     "&RID=", results$rid,
                     "&FORMAT_TYPE=Text")

  response <- httr::GET(retrieve)
  checkResponse(response, "retrieve")

  txt <- httr::content(response, "text", encoding = "UTF-8")

  # txt contains the whole set of results. Process:

  # First, we strsplit() on linebreaks:
  txt <- unlist(strsplit(txt, "\n"))

  # The alignments range from the first line that begins with ">" ...
  iDef <- grep("^>", txt)
  # ... to the last line that begins with "Sbjct"
  iSbjct <- grep("^Sbjct", txt)

  if (length(iDef) == 0 || length(iSbjct) == 0) {
    stop("PANIC: Can't find alignments in the BLAST server response.")
  }

  # Get the alignments block
  txt <- txt[iDef[1]:iSbjct[length(iSbjct)]]

  # Drop empty lines
  txt <- txt[nchar(txt) != 0]

  # A line that ends "]" but does not begin ">" seems to be a split
  # defline ... eg.
  #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
  #  [2] "EXF-2481]"
  #  Merge these lines to the preceding lines and delete them.
  #
  x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
  if (length(x) > 0) {
    txt[x - 1] <- paste0(txt[x - 1], txt[x])
    txt <- txt[-x]
  }

  # Special case: there may be multiple deflines when the BLAST hit is to
  # redundant, identical sequences. Keep only the first instance: drop every
  # defline that directly follows another defline. Deflines that are not
  # part of such a run are always kept.
  isDef <- grepl("^>", txt)
  isRepeat <- isDef & c(FALSE, isDef[-length(isDef)])
  txt <- txt[! isRepeat]

  # After this preprocessing the following should be true:
  # - Every alignment block begins with a defline in which the
  #   first character is ">"
  # - There is only one defline in each block.
  # - Lines are not split.

  # First and last indices of the alignment blocks: a block extends from
  # its defline to the line before the next defline (or the end of txt).
  iFirst <- grep("^>", txt)
  iLast  <- c((iFirst[-1] - 1), length(txt))

  # Build the hits list by parsing the blocks
  results$hits <- lapply(seq_along(iFirst), function(i) {
    parseBLASTalignment(txt[iFirst[i]:iLast[i]])
  })

  return(results)
}
|  | ||||
parseBLASTalignment <- function(hit) {
  # Parse data from a character vector containing a BLAST hit
  # Parameters:
  #    hit  char   one BLAST hit as char vector: the defline comes first,
  #                followed by the statistics lines and the alignment, in
  #                which every "Query" line is directly followed by its
  #                midline and its "Sbjct" line
  # Value:
  #          list   $def          chr   defline
  #                 $accession    chr   accession number
  #                 $organism     chr   complete organism definition
  #                 $species      chr   binomial species
  #                 $E            num   E value
  #                 $lengthAli    num   length, as reported on the first
  #                                     "Length=" line of the hit
  #                 $nIdentities  num   number of identities
  #                 $nGaps        num   number of gaps
  #                 $Qbounds      num   2-element vector of query start-end
  #                 $Sbounds      num   2-element vector of subject start-end
  #                 $Qseq         chr   query sequence
  #                 $midSeq       chr   midline string
  #                 $Sseq         chr   subject sequence
  #
  # Items that can not be found in the hit are NA (the three sequence
  # strings are "" if the hit contains no "Query" lines).

  getToken <- function(patt, v) {
    # Return the first capture group of patt, taken from the first element
    # of character vector v that matches patt; NA if nothing matches.
    v <- v[grep(patt, v)]
    if (length(v) == 0) {
      return(NA)
    }
    regmatches(v[1], regexec(patt, v[1]))[[1]][2]
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
  h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism: the text between square brackets in the defline
  patt <- "\\[(.+)]"
  h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species: the first two words of the organism. A defline without an
  # organism must be handled first: strsplit(NA) returns NA, which would
  # otherwise be pasted into the string "NA sp."
  if (is.na(h$organism)) {
    h$species <- NA_character_
  } else {
    x <- unlist(strsplit(trimws(h$organism), "\\s+"))
    if (length(x) >= 2) {
      h$species <- paste(x[1], x[2])
    } else if (length(x) == 1) {
      h$species <- paste(x[1], "sp.")
    } else {
      h$species <- NA_character_
    }
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # length
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: each "Query" line is followed by the
  # midline and the "Sbjct" line
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions: the start is the number that follows the label
  # on the first line, the end is the number that closes the last line
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences: the columns that the aligned string occupies on the
  # "Query" line are cut from all three lines, because the midline carries
  # no label from which its offset could be determined
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)" # capture aligned string
  for (i in seq_along(Que)) {
    m <- regexec(patt, Que[i])
    iFirst <- m[[1]][2]
    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}
|  | ||||
|  | ||||
| # ==== TESTS =================================================================== | ||||
|  | ||||
if (FALSE) {
  # This block is never executed when the file is sourced; its lines are
  # meant to be run by hand.

  # The query can be a sequence ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")
  # ... or a database identifier
  q <- "NP_010227" # refseq ID

  # run the search, restricted to Fungi
  test <- BLAST(Q = q,
                limits = "txid4751[ORGN]",
                rid = "",
                E = 0.001,
                nHits = 100)
  str(test)
  length(test$hits)
}
|  | ||||
| # [END] | ||||
|  | ||||
| # BLAST.R | ||||
| # | ||||
| # Purpose: Send off one BLAST search and return parsed list of results | ||||
| #          This script uses the BLAST URL-API | ||||
| #          (Application Programming Interface) at the NCBI. | ||||
| #          Read about the constraints here: | ||||
| #          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo | ||||
| # | ||||
| # | ||||
| # Version: 3.2 | ||||
| # Date:    2016 09 - 2020 09 | ||||
| # Author:  Boris Steipe | ||||
| # | ||||
| # Versions: | ||||
| #    3.2   2020 updates | ||||
| #    3.1   Change from require() to requireNamespace(), | ||||
| #          use <package>::<function>() idiom throughout | ||||
| #    3.0   parsing logic had not been fully implemented; Fixed. | ||||
| #    2.1   bugfix in BLAST(), bug was blanking non-split deflines; | ||||
| #          refactored parseBLASTalignment() to handle lists with multiple hits. | ||||
| #    2.0   Completely rewritten because the interface completely changed. | ||||
| #          Code adapted in part from NCBI Perl sample code: | ||||
| #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $ | ||||
| #    1.0   first version posted for BCH441 2016, based on BLAST - API | ||||
| # | ||||
| # ToDo:    Return the organism/strain name in the output, and propagate | ||||
| #          into MYSPE selection script. | ||||
| # | ||||
| # Notes:   This is somewhat pedestrian, but apparently there are currently | ||||
| #          no R packages that contain such code. | ||||
| # | ||||
| # ============================================================================== | ||||
|  | ||||
|  | ||||
# BLAST() calls functions of the httr package as httr::<function>();
# install the package if it is not available.
if (isFALSE(requireNamespace("httr", quietly = TRUE))) {
  install.packages("httr")
}
|  | ||||
|  | ||||
BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
  # Purpose:
  #     Basic BLAST (blastp) search via the NCBI BLAST URL-API: submit the
  #     query (or pick up an earlier search by its rid), poll until the
  #     search is done, retrieve the text report and parse its alignments.
  #
  # Parameters:
  #     Q: query - either a valid ID or a sequence
  #     db: "refseq_protein" by default,
  #         other legal values include: "nr", "pdb", "swissprot" ...
  #     nHits: number of hits to maximally return
  #     E: E-value cutoff. Do not return hits whose score would be expected
  #        to occur E or more times in a database of random sequence.
  #     limits: a valid ENTREZ filter
  #     rid: a request ID - to retrieve earlier search results
  #     query: the actual query string (needed when retrieving results
  #            with an rid)
  #     quietly: if TRUE wait silently, otherwise print progress messages
  #              and show the wait time with waitTimer()
  #     myTimeout: how much longer _after_ rtoe to wait for a result
  #                before giving up (seconds)
  # Value:
  #     result: list of process status or resulting hits, and some metadata:
  #             $query  the request URL (or the "query" argument if an rid
  #                     was supplied)
  #             $rid    the request ID
  #             $rtoe   the estimated time to completion reported by the
  #                     server (0 if an rid was supplied)
  #             $hits   list with one parseBLASTalignment() result per hit;
  #                     absent if the search timed out, failed, expired,
  #                     or returned nothing that could be parsed.

  BLASTURL  <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
  EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done

  fetchText <- function(URL, task) {
    # GET() the URL and return the response body as one string;
    # stop if the server does not report success.
    response <- httr::GET(URL)
    if (httr::http_status(response)$category != "Success") {
      stop(sprintf("PANIC: Can't %s. BLAST server status error: %s",
                   task,
                   httr::http_status(response)$message))
    }
    httr::content(response, "text", encoding = "UTF-8")
  }

  contains <- function(patt, s) {
    # TRUE if patt is found in s
    length(grep(patt, s)) > 0
  }

  results <- list()
  results$query <- query
  results$rid <- rid
  results$rtoe <- 0

  if (rid == "") {  # If no rid is available, spawn a search.
                    # Else, proceed directly to retrieval.

    # prepare query, GET(), and parse rid and rtoe from BLAST server response
    results$query <- paste0(BLASTURL,
                            "?",
                            "CMD=Put",
                            "&PROGRAM=", "blastp",
                            "&QUERY=", URLencode(Q),
                            "&DATABASE=", db,
                            "&MATRIX=", "BLOSUM62",
                            "&EXPECT=", as.character(E),
                            "&HITLIST_SIZE=", as.character(nHits),
                            "&ALIGNMENTS=", as.character(nHits),
                            "&FORMAT_TYPE=Text")

    if (limits != "") {
      # encode the filter like the query, so that e.g. blanks are legal
      results$query <- paste0(results$query,
                              "&ENTREZ_QUERY=", URLencode(limits))
    }

    # send it off ...
    txt <- fetchText(results$query, "send query")

    patt <- "RID = (\\w+)" # match the request id
    results$rid  <- regmatches(txt, regexec(patt, txt))[[1]][2]

    patt <- "RTOE = (\\d+)" # match the expected completion time
    results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])

    # Without a request ID there is nothing we could poll for.
    if (is.na(results$rid)) {
      stop("PANIC: BLAST server response did not contain a request ID.")
    }
    # Without an estimate, skip the initial wait and go on to polling.
    if (is.na(results$rtoe)) {
      results$rtoe <- 0
    }

    # Now we wait ...
    if (quietly) {
      Sys.sleep(results$rtoe)
    } else {
      cat(sprintf("BLAST is processing %s:\n", results$rid))
      waitTimer(results$rtoe)
    }

  } # done sending query and retrieving rid, rtoe

  # Enter a loop to check for result availability. The loop is left by
  # break when results are ready, or by return() when the search can not
  # be completed.
  checkStatus <- paste0(BLASTURL,
                        "?",
                        "CMD=Get",
                        "&RID=", results$rid,
                        "&FORMAT_TYPE=Text",
                        "&FORMAT_OBJECT=SearchInfo")

  while (TRUE) {
    # Check whether the result is ready
    txt <- fetchText(checkStatus, "check status")

    if (contains("Status=WAITING", txt)) {
      status <- "Waiting"

    } else if (contains("Status=FAILED", txt)) {
      cat("BLAST search returned status \"FAILED\". Aborting.\n")
      return(results)

    } else if (contains("Status=UNKNOWN", txt)) {
      cat("BLAST search returned status \"UNKNOWN\".\n")
      cat("This probably means the rid has expired. Aborting.\n")
      return(results)

    } else if (contains("Status=READY", txt)) {  # Done

      if (! contains("ThereAreHits=yes", txt)) {  # No hits
        cat("BLAST search ready but no hits found. Aborting.\n")
        return(results)
      }
      break  # done ... retrieve search result

    } else {
      # None of the known status strings was found. Handle this like a
      # wait cycle, so that the server is never polled without delay and
      # the timeout still applies.
      status <- "Unrecognized"
    }

    # Only "Waiting" and "Unrecognized" get here: wait and try again,
    # unless the time is used up.
    myTimeout <- myTimeout - EXTRAWAIT

    if (myTimeout <= 0) { # abort
      cat("BLAST search not concluded before timeout. Aborting.\n")
      cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
                  "Trying checking back later with >",
                  results$rid))
      return(results)
    }

    if (quietly) {
      Sys.sleep(EXTRAWAIT)
    } else {
      cat(sprintf("Status: %s. Wait %d more seconds (max. %d more)",
                  status,
                  EXTRAWAIT,
                  myTimeout))
      waitTimer(EXTRAWAIT)
    }
  } # end result-check loop

  # retrieve results from BLAST server
  retrieve <- paste0(BLASTURL,
                     "?",
                     "&CMD=Get",
                     "&RID=", results$rid,
                     "&FORMAT_TYPE=Text")

  txt <- fetchText(retrieve, "retrieve")

  # txt contains the whole set of results. Process:

  # First, we strsplit() on linebreaks:
  txt <- unlist(strsplit(txt, "\n"))

  # The alignments range from the first line that begins with ">" ...
  iFirst <- grep("^>", txt)[1]

  # ... to the last line that begins with "Sbjct"
  x <- grep("^Sbjct", txt)

  if (is.na(iFirst) || length(x) == 0) {
    cat("BLAST result contains no alignments that can be parsed. Aborting.\n")
    return(results)
  }
  iLast <- x[length(x)]

  # Get the alignments block
  txt <- txt[iFirst:iLast]

  # Drop empty lines
  txt <- txt[nchar(txt) > 0]

  # A line that ends "]" but does not begin ">" seems to be a split
  # defline ... eg.
  #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
  #  [2] "EXF-2481]"
  #  Merge these lines to the preceding lines and delete them.
  #
  x <- which(grepl("]$", txt) & ! grepl("^>", txt))
  if (length(x) > 0) {
    txt[x - 1] <- paste0(txt[x - 1], txt[x])
    txt <- txt[-x]
  }

  # Special case: there may be multiple deflines when the BLAST hit is to
  # redundant, identical sequences. Keep only the first instance, i.e. drop
  # every defline that directly follows another defline. Deflines that
  # stand alone are not affected.
  isDefline <- grepl("^>", txt)
  isRepeat  <- isDefline & c(FALSE, isDefline[-length(isDefline)])
  txt <- txt[! isRepeat]

  # After this preprocessing the following should be true:
  # - Every alignment block begins with a defline in which the
  #   first character is ">"
  # - There is only one defline in each block.
  # - Lines are not split.

  # First and last indices of the alignment blocks: a block extends from
  # its defline to the line before the next defline, or to the end of txt.
  iStart <- grep("^>", txt)
  iEnd   <- c(iStart[-1] - 1, length(txt))

  # Build the hits list by parsing the blocks
  results$hits <- lapply(seq_along(iStart),
                         function(i) {
                           parseBLASTalignment(txt[iStart[i]:iEnd[i]])
                         })

  return(results)
}
|  | ||||
parseBLASTalignment <- function(hit) {
  # Parse data from a character vector containing a BLAST hit
  # Parameters:
  #    hit  char   one BLAST hit as char vector: the defline comes first,
  #                followed by the statistics lines and the alignment, in
  #                which every "Query" line is directly followed by its
  #                midline and its "Sbjct" line
  # Value:
  #          list   $def          chr   defline
  #                 $accession    chr   accession number
  #                 $organism     chr   complete organism definition
  #                 $species      chr   binomial species
  #                 $E            num   E value
  #                 $lengthAli    num   length, as reported on the first
  #                                     "Length=" line of the hit
  #                 $nIdentities  num   number of identities
  #                 $nGaps        num   number of gaps
  #                 $Qbounds      num   2-element vector of query start-end
  #                 $Sbounds      num   2-element vector of subject start-end
  #                 $Qseq         chr   query sequence
  #                 $midSeq       chr   midline string
  #                 $Sseq         chr   subject sequence
  #
  # Items that can not be found in the hit are NA (the three sequence
  # strings are "" if the hit contains no "Query" lines).

  getToken <- function(patt, v) {
    # Return the first capture group of patt, taken from the first element
    # of character vector v that matches patt; NA if nothing matches.
    v <- v[grep(patt, v)]
    if (length(v) == 0) {
      return(NA)
    }
    regmatches(v[1], regexec(patt, v[1]))[[1]][2]
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
  h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism: the text between square brackets in the defline
  patt <- "\\[(.+)]"
  h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species: the first two words of the organism. A defline without an
  # organism must be handled first: strsplit(NA) returns NA, which would
  # otherwise be pasted into the string "NA sp."
  if (is.na(h$organism)) {
    h$species <- NA_character_
  } else {
    x <- unlist(strsplit(trimws(h$organism), "\\s+"))
    if (length(x) >= 2) {
      h$species <- paste(x[1], x[2])
    } else if (length(x) == 1) {
      h$species <- paste(x[1], "sp.")
    } else {
      h$species <- NA_character_
    }
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # length
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: each "Query" line is followed by the
  # midline and the "Sbjct" line
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions: the start is the number that follows the label
  # on the first line, the end is the number that closes the last line
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences: the columns that the aligned string occupies on the
  # "Query" line are cut from all three lines, because the midline carries
  # no label from which its offset could be determined
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)" # capture aligned string
  for (i in seq_along(Que)) {
    m <- regexec(patt, Que[i])
    iFirst <- m[[1]][2]
    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}
|  | ||||
|  | ||||
| # ==== TESTS =================================================================== | ||||
|  | ||||
if (FALSE) {
  # This block is never executed when the file is sourced; its lines are
  # meant to be run by hand.

  # The query can be a sequence ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")
  # ... or a database identifier
  q <- "NP_010227" # refseq ID

  # run the search, restricted to Fungi
  test <- BLAST(Q = q,
                limits = "txid4751[ORGN]",
                rid = "",
                E = 0.001,
                nHits = 100)
  str(test)
  length(test$hits)
}
|  | ||||
| # [END] | ||||
|  | ||||
|   | ||||
| @@ -1,32 +1,32 @@ | ||||
# test_biCode.R
#
# testthat expectations for biCode(). biCode() itself is not defined in
# this file and has to be available when the tests are run.

context("biCode() utility function tests")

test_that("expected input is processed correctly", {
  # scalar input, bracketed input, and a vector of names
  marsupials <- c("Phascolarctos cinereus", "Macropus rufus")
  expect_equal(biCode("homo sapiens"), "HOMSA")
  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
  expect_equal(biCode(marsupials), c("PHACI", "MACRU"))
})

test_that("unexpected input is managed", {
  # "." is expected wherever no letter is available
  expect_equal(biCode(""), ".....")
  expect_equal(biCode(" "), ".....")
  expect_equal(biCode("123 12"), ".....")
  expect_equal(biCode("h sapiens"), "H..SA")
})

test_that("NA values are preserved", {
  withNA <- c("first", NA, "last")
  expect_true(is.na((biCode(NA))))
  expect_equal(biCode(withNA), c("FIRST", NA, "LAST."))
})

test_that("Missing argument throws an error", {
  expect_error(biCode(), "argument \"s\" is missing, with no default")
})


# [END]
# test_biCode.R
#
# testthat expectations for biCode(). biCode() itself is not defined in
# this file and has to be available when the tests are run.

context("biCode() utility function tests")

test_that("expected input is processed correctly", {
  # scalar input, bracketed input, and a vector of names
  marsupials <- c("Phascolarctos cinereus", "Macropus rufus")
  expect_equal(biCode("homo sapiens"), "HOMSA")
  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
  expect_equal(biCode(marsupials), c("PHACI", "MACRU"))
})

test_that("unexpected input is managed", {
  # "." is expected wherever no letter is available
  expect_equal(biCode(""), ".....")
  expect_equal(biCode(" "), ".....")
  expect_equal(biCode("123 12"), ".....")
  expect_equal(biCode("h sapiens"), "H..SA")
})

test_that("NA values are preserved", {
  withNA <- c("first", NA, "last")
  expect_true(is.na((biCode(NA))))
  expect_equal(biCode(withNA), c("FIRST", NA, "LAST."))
})

test_that("Missing argument throws an error", {
  expect_error(biCode(), "argument \"s\" is missing, with no default")
})


# [END]
|   | ||||
		Reference in New Issue
	
	Block a user