Line termination change and old code.
This commit is contained in:
		
							
								
								
									
										258
									
								
								.Rprofile
									
									
									
									
									
								
							
							
						
						
									
										258
									
								
								.Rprofile
									
									
									
									
									
								
							| @@ -1,129 +1,129 @@ | |||||||
| # .Rprofile | # .Rprofile | ||||||
| # | # | ||||||
| # This script is automatically executed on startup | # This script is automatically executed on startup | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
| init <- function() { | init <- function() { | ||||||
|  |  | ||||||
|   # Create a local copy of myScript.R if not done yet. |   # Create a local copy of myScript.R if not done yet. | ||||||
|   if (! file.exists("myScript.R") && file.exists(".tmp.R")) { |   if (! file.exists("myScript.R") && file.exists(".tmp.R")) { | ||||||
|     file.copy(".tmp.R", "myScript.R") |     file.copy(".tmp.R", "myScript.R") | ||||||
|     cat("A new file \"myScript.R\" was created. You can use it for\n") |     cat("A new file \"myScript.R\" was created. You can use it for\n") | ||||||
|     cat("notes and code experiments.\n\n") |     cat("notes and code experiments.\n\n") | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   cat("\n\n") |   cat("\n\n") | ||||||
|   cat("Please open the file \".myProfile.R\" (click on the file-name in the\n") |   cat("Please open the file \".myProfile.R\" (click on the file-name in the\n") | ||||||
|   cat("\"files\" pane), edit it and save it.\n") |   cat("\"files\" pane), edit it and save it.\n") | ||||||
|   cat("Then click the checkbox, and use the More -> Move... dialogue\n") |   cat("Then click the checkbox, and use the More -> Move... dialogue\n") | ||||||
|   cat("to move it into the \"myScripts\" folder.\n\n") |   cat("to move it into the \"myScripts\" folder.\n\n") | ||||||
|  |  | ||||||
|   file.edit("ABC-units.R") |   file.edit("ABC-units.R") | ||||||
|   return(invisible(NULL)) |   return(invisible(NULL)) | ||||||
| } | } | ||||||
|  |  | ||||||
| if (! file.exists("./myScripts/.myProfile.R")) { | if (! file.exists("./myScripts/.myProfile.R")) { | ||||||
|   cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n") |   cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n") | ||||||
|   cat("    =================") |   cat("    =================") | ||||||
|   cat("\n\n") |   cat("\n\n") | ||||||
|   cat("        WELCOME !\n") |   cat("        WELCOME !\n") | ||||||
|   cat("\n") |   cat("\n") | ||||||
|   cat("  Type  'init()'  to begin\n\n") |   cat("  Type  'init()'  to begin\n\n") | ||||||
|   cat("\n") |   cat("\n") | ||||||
|   cat("    =================") |   cat("    =================") | ||||||
|   cat("\n\n") |   cat("\n\n") | ||||||
|  |  | ||||||
| } else {  # local profile exists ... validate state: | } else {  # local profile exists ... validate state: | ||||||
|   cat("\n\nLoading local functions ...") |   cat("\n\nLoading local functions ...") | ||||||
|  |  | ||||||
|   source(".utilities.R")  # local profile appears sane, source utilities |   source(".utilities.R")  # local profile appears sane, source utilities | ||||||
|   source("./myScripts/.myProfile.R") |   source("./myScripts/.myProfile.R") | ||||||
|  |  | ||||||
|   if (! exists("myEMail")) {  # ... has eMail been defined? |   if (! exists("myEMail")) {  # ... has eMail been defined? | ||||||
|     cat("ERROR !\n") |     cat("ERROR !\n") | ||||||
|     cat("=======\n") |     cat("=======\n") | ||||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") |     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||||
|     cat("the variable \"myEMail\" was not loaded.\n") |     cat("the variable \"myEMail\" was not loaded.\n") | ||||||
|     cat("Please contact your instructor to continue.\n\n") |     cat("Please contact your instructor to continue.\n\n") | ||||||
|   } |   } | ||||||
|   if (! exists("myStudentNumber")) {  # ... has the Student Number been defined? |   if (! exists("myStudentNumber")) {  # ... has the Student Number been defined? | ||||||
|     cat("ERROR !\n") |     cat("ERROR !\n") | ||||||
|     cat("=======\n") |     cat("=======\n") | ||||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") |     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||||
|     cat("the variable \"myStudentNumber\" was not loaded.\n") |     cat("the variable \"myStudentNumber\" was not loaded.\n") | ||||||
|     cat("Please contact your instructor to continue.\n\n") |     cat("Please contact your instructor to continue.\n\n") | ||||||
|   } |   } | ||||||
|   if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) { |   if (! grepl("^(100.{7})|(99.{7})$", as.character(myStudentNumber))) { | ||||||
|     cat("ERROR !\n")                 # is the Student Number valid? |     cat("ERROR !\n")                 # is the Student Number valid? | ||||||
|     cat("=======\n") |     cat("=======\n") | ||||||
|     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") |     cat("The file \"./myScripts/.myProfile.R\" exists, but\n") | ||||||
|     cat("your Student Number could not be validated.\n") |     cat("your Student Number could not be validated.\n") | ||||||
|     cat("Please examine the file \"./myScripts/.myProfile.R\"\n") |     cat("Please examine the file \"./myScripts/.myProfile.R\"\n") | ||||||
|     cat(" and fix the problem or contact your instructor to continue.\n\n") |     cat(" and fix the problem or contact your instructor to continue.\n\n") | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now |   if (! exists("MYSPE")) {  # if MYSPE has not yet been defined, define it now | ||||||
|                             # ... and write it into the profile. |                             # ... and write it into the profile. | ||||||
|        prf <- readLines("./myScripts/.myProfile.R") |        prf <- readLines("./myScripts/.myProfile.R") | ||||||
|        iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf) |        iEmail <- grep("^\\s*myStudentNumber\\s*<-", prf) | ||||||
|        out <- prf[1:iEmail] |        out <- prf[1:iEmail] | ||||||
|        out <- c(out, sprintf("MYSPE <- \"%s\" ", |        out <- c(out, sprintf("MYSPE <- \"%s\" ", | ||||||
|                              getMYSPE(myStudentNumber))) |                              getMYSPE(myStudentNumber))) | ||||||
|        out <- c(out, prf[(iEmail+1):length(prf)]) |        out <- c(out, prf[(iEmail+1):length(prf)]) | ||||||
|        writeLines(out, "./myScripts/.myProfile.R") |        writeLines(out, "./myScripts/.myProfile.R") | ||||||
|  |  | ||||||
|        cat("\n") |        cat("\n") | ||||||
|        cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n", |        cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n", | ||||||
|                    getMYSPE(myStudentNumber))) |                    getMYSPE(myStudentNumber))) | ||||||
|        MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use |        MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use | ||||||
|        rm(prf, iEmail, out)                # cleanup |        rm(prf, iEmail, out)                # cleanup | ||||||
|   } |   } | ||||||
|   cat("... done.\n\n") |   cat("... done.\n\n") | ||||||
| } | } | ||||||
|  |  | ||||||
| if (default.stringsAsFactors()) { | if (default.stringsAsFactors()) { | ||||||
|   cat("WARNING.\n") |   cat("WARNING.\n") | ||||||
|   cat("========\n") |   cat("========\n") | ||||||
|   cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n") |   cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n") | ||||||
|   cat("This will break some of the code.\n") |   cat("This will break some of the code.\n") | ||||||
|   cat("Please contact your instructor to troubleshoot and fix this issue.\n") |   cat("Please contact your instructor to troubleshoot and fix this issue.\n") | ||||||
|   cat("\n") |   cat("\n") | ||||||
| } | } | ||||||
|  |  | ||||||
| errText <- list() | errText <- list() | ||||||
| errText[["noProfileFile"]] <- ' | errText[["noProfileFile"]] <- ' | ||||||
| Your PROFILE FILE does not exist. This problem must be fixed to continue. | Your PROFILE FILE does not exist. This problem must be fixed to continue. | ||||||
|  |  | ||||||
|   The code expects the file "./myScripts/.myProfile.R" to exist and to |   The code expects the file "./myScripts/.myProfile.R" to exist and to | ||||||
|   contain your correct eMail address and student number. Detailed |   contain your correct eMail address and student number. Detailed | ||||||
|   instructions were given when you first ran the init() command. |   instructions were given when you first ran the init() command. | ||||||
|  |  | ||||||
|   Try running init() again and follow the instructions. Reload youR RStudio |   Try running init() again and follow the instructions. Reload youR RStudio | ||||||
|   session and start over with this file. |   session and start over with this file. | ||||||
|  |  | ||||||
|   If this does not fix the problem, ask for help. |   If this does not fix the problem, ask for help. | ||||||
| ' | ' | ||||||
|  |  | ||||||
| errText[["noStudentNumber"]] <- ' | errText[["noStudentNumber"]] <- ' | ||||||
| Your STUDENT NUMBER has not been defined. This problem must be fixed to continue. | Your STUDENT NUMBER has not been defined. This problem must be fixed to continue. | ||||||
|  |  | ||||||
|   The code expects the file "./myScripts/.myProfile.R" to exist and to |   The code expects the file "./myScripts/.myProfile.R" to exist and to | ||||||
|   contain your correct eMail address and student number. This file gets |   contain your correct eMail address and student number. This file gets | ||||||
|   sourced when you start a new R-session, but since you see this error |   sourced when you start a new R-session, but since you see this error | ||||||
|   message there was a problem. |   message there was a problem. | ||||||
|  |  | ||||||
|   Perhaps you need to restart your R-session. Try closing the RStudio |   Perhaps you need to restart your R-session. Try closing the RStudio | ||||||
|   project and reopening it from the File > Recent Projects menu. |   project and reopening it from the File > Recent Projects menu. | ||||||
|  |  | ||||||
|   Perhaps there was a syntax error in your file. Then not all the |   Perhaps there was a syntax error in your file. Then not all the | ||||||
|   instructions in the file are executed. Check the file: is your |   instructions in the file are executed. Check the file: is your | ||||||
|   email perhpas not defined? Or did you type it without qwuoataion |   email perhpas not defined? Or did you type it without qwuoataion | ||||||
|   marks? |   marks? | ||||||
|  |  | ||||||
|   Try fixing problems, and then restart R as described above. |   Try fixing problems, and then restart R as described above. | ||||||
|  |  | ||||||
|   If none of this fixes the problem, ask for help. |   If none of this fixes the problem, ask for help. | ||||||
| ' | ' | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										88
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										88
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,44 +1,44 @@ | |||||||
| # Miscellaneous | # Miscellaneous | ||||||
| .Ds_store | .Ds_store | ||||||
| instructor/ | instructor/ | ||||||
| dev/ | dev/ | ||||||
| # myScripts/ # We don't want to ignore this so we can save our work to our own fork. | # myScripts/ # We don't want to ignore this so we can save our work to our own fork. | ||||||
|  |  | ||||||
| # History files | # History files | ||||||
| .Rhistory | .Rhistory | ||||||
| .Rapp.history | .Rapp.history | ||||||
|  |  | ||||||
| # Session Data files | # Session Data files | ||||||
| # .RData | # .RData | ||||||
|  |  | ||||||
| # Files produced in assignments | # Files produced in assignments | ||||||
| data/APSESphyloSet.mfa | data/APSESphyloSet.mfa | ||||||
| data/APSEStreeRproml.rds | data/APSEStreeRproml.rds | ||||||
|  |  | ||||||
| # Example code in package build process | # Example code in package build process | ||||||
| *-Ex.R | *-Ex.R | ||||||
|  |  | ||||||
| # Output files from R CMD build | # Output files from R CMD build | ||||||
| /*.tar.gz | /*.tar.gz | ||||||
|  |  | ||||||
| # Output files from R CMD check | # Output files from R CMD check | ||||||
| /*.Rcheck/ | /*.Rcheck/ | ||||||
|  |  | ||||||
| # RStudio files | # RStudio files | ||||||
| .Rproj.user/ | .Rproj.user/ | ||||||
|  |  | ||||||
| # produced vignettes | # produced vignettes | ||||||
| vignettes/*.html | vignettes/*.html | ||||||
| vignettes/*.pdf | vignettes/*.pdf | ||||||
|  |  | ||||||
| # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 | ||||||
| .httr-oauth | .httr-oauth | ||||||
|  |  | ||||||
| # knitr and R markdown default cache directories | # knitr and R markdown default cache directories | ||||||
| /*_cache/ | /*_cache/ | ||||||
| /cache/ | /cache/ | ||||||
|  |  | ||||||
| # Temporary files created by R markdown | # Temporary files created by R markdown | ||||||
| *.utf8.md | *.utf8.md | ||||||
| *.knit.md | *.knit.md | ||||||
| .Rproj.user | .Rproj.user | ||||||
|   | |||||||
							
								
								
									
										76
									
								
								.tmp.R
									
									
									
									
									
								
							
							
						
						
									
										76
									
								
								.tmp.R
									
									
									
									
									
								
							| @@ -1,38 +1,38 @@ | |||||||
| # myScript.R | # myScript.R | ||||||
| # | # | ||||||
| # --- As you work with this file, you can delete the instructions below -------- | # --- As you work with this file, you can delete the instructions below -------- | ||||||
| # Write your notes and code experiments into this document. Save it | # Write your notes and code experiments into this document. Save it | ||||||
| # from time to time - however I recommend that you do not _commit_ | # from time to time - however I recommend that you do not _commit_ | ||||||
| # your saved version. | # your saved version. | ||||||
| # | # | ||||||
| # As long as you do not _commit_ this script to version control, | # As long as you do not _commit_ this script to version control, | ||||||
| # you can _pull_ updated versions of the entire project from GitHub | # you can _pull_ updated versions of the entire project from GitHub | ||||||
| # by using the RStudio version control interface. However, once | # by using the RStudio version control interface. However, once | ||||||
| # you _commit_ any file in your local version, RStudio will require | # you _commit_ any file in your local version, RStudio will require | ||||||
| # you to resolve conflicts before you can _pull_ updates. | # you to resolve conflicts before you can _pull_ updates. | ||||||
| # --- As you work with this file, you can delete the instructions above -------- | # --- As you work with this file, you can delete the instructions above -------- | ||||||
| # | # | ||||||
| ## Purpose: <...> | ## Purpose: <...> | ||||||
| # | # | ||||||
| # Version: <...> | # Version: <...> | ||||||
| # | # | ||||||
| # Date:    <...> | # Date:    <...> | ||||||
| # Author:  <Name> (<namee@mail.utoronto.ca>) | # Author:  <Name> (<namee@mail.utoronto.ca>) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| # | # | ||||||
| #   <number>    <Features> | #   <number>    <Features> | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| #   <...> | #   <...> | ||||||
| # | # | ||||||
| # ==================================================================== | # ==================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										1308
									
								
								.utilities.R
									
									
									
									
									
								
							
							
						
						
									
										1308
									
								
								.utilities.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,257 +1,257 @@ | |||||||
| # 2021-10-12_In-Class_exploration.R | # 2021-10-12_In-Class_exploration.R | ||||||
| # | # | ||||||
| #         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D ===== | #         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D ===== | ||||||
| # | # | ||||||
| # Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12 | # Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12 | ||||||
| # Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu | # Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu | ||||||
| # Scribe:     boris.steipe@utoronto.ca | # Scribe:     boris.steipe@utoronto.ca | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # | # | ||||||
| # In our last session we explored some properties of amino acids and noted that | # In our last session we explored some properties of amino acids and noted that | ||||||
| # we can arrange them in a scatter-plot according to some properties. But can | # we can arrange them in a scatter-plot according to some properties. But can | ||||||
| # we also arrange them according to generic properties, i.e. taking all | # we also arrange them according to generic properties, i.e. taking all | ||||||
| # published property scales into account? We will try to use all tables from | # published property scales into account? We will try to use all tables from | ||||||
| # the seqinr package. | # the seqinr package. | ||||||
|  |  | ||||||
| # First we load the package - this makes all datasets immediately available and | # First we load the package - this makes all datasets immediately available and | ||||||
| # we don't have to load them one by one. | # we don't have to load them one by one. | ||||||
|  |  | ||||||
| library(seqinr) | library(seqinr) | ||||||
|  |  | ||||||
| # Determine what datasets are available | # Determine what datasets are available | ||||||
| # | # | ||||||
| # Using "find in topic" ... "amino acid" | # Using "find in topic" ... "amino acid" | ||||||
| data(aacost) | data(aacost) | ||||||
| data(aaindex) | data(aaindex) | ||||||
| data(pK) | data(pK) | ||||||
|  |  | ||||||
| # We note that datasets may be sorted in different ways: for example | # We note that datasets may be sorted in different ways: for example | ||||||
| # alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala, | # alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala, | ||||||
| # Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino | # Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino | ||||||
| # acids are sorted in the same way. | # acids are sorted in the same way. | ||||||
|  |  | ||||||
| # Build a datastructure ... | # Build a datastructure ... | ||||||
| # rows: amino acids | # rows: amino acids | ||||||
| # columns: properties | # columns: properties | ||||||
|  |  | ||||||
| # Are all lists in aaindex organized in the same way? | # Are all lists in aaindex organized in the same way? | ||||||
|  |  | ||||||
| refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item | refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item | ||||||
|                                   # index as a reference list |                                   # index as a reference list | ||||||
|  |  | ||||||
| # Loop over each list in aaindex | # Loop over each list in aaindex | ||||||
| for (i in 1:length(aaindex)) { | for (i in 1:length(aaindex)) { | ||||||
| #   get the I-vector | #   get the I-vector | ||||||
|   x <- aaindex[[i]]$I |   x <- aaindex[[i]]$I | ||||||
| #   get the names | #   get the names | ||||||
|   x <- names(x) |   x <- names(x) | ||||||
| #   compare with the names of our reference list | #   compare with the names of our reference list | ||||||
| #   the == and != operators are vectorized. Applying them to two vectors | #   the == and != operators are vectorized. Applying them to two vectors | ||||||
| #   gives TRUE or FALSE for each pair of elements. any() or all() can be | #   gives TRUE or FALSE for each pair of elements. any() or all() can be | ||||||
| #   applied to logical vectors to analyse them and return a single result. | #   applied to logical vectors to analyse them and return a single result. | ||||||
| #   if (...) conditions evaluate only a single value and will throw a warning if | #   if (...) conditions evaluate only a single value and will throw a warning if | ||||||
| #   there is more than one. | #   there is more than one. | ||||||
|  |  | ||||||
|   if (any(x != refNames)) { |   if (any(x != refNames)) { | ||||||
|     # There was at least one not-equal pair - so: complain |     # There was at least one not-equal pair - so: complain | ||||||
|     print(sprintf("Problem in list %d: names don't match", i)) |     print(sprintf("Problem in list %d: names don't match", i)) | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| # If we get here without identifying problems, it means all pairs of | # If we get here without identifying problems, it means all pairs of | ||||||
| # rownames match throughout the aaindex list. | # rownames match throughout the aaindex list. | ||||||
|  |  | ||||||
|  |  | ||||||
| # Next: what is the correct syntax to add one vector (the "I" vector of | # Next: what is the correct syntax to add one vector (the "I" vector of | ||||||
| # one of the list elements) to our dataframe? | # one of the list elements) to our dataframe? | ||||||
| aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index | aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index | ||||||
| aaData[,2] <- aaindex[[2]]$I            # ... add the second index | aaData[,2] <- aaindex[[2]]$I            # ... add the second index | ||||||
|  |  | ||||||
| str(aaData)  # Confirm: we now have a two-column dataframe | str(aaData)  # Confirm: we now have a two-column dataframe | ||||||
|  |  | ||||||
| # Next: add the rest ... | # Next: add the rest ... | ||||||
| for (i in 3:length(aaindex)) { | for (i in 3:length(aaindex)) { | ||||||
|   #   get the I-vector and write it into our dataframe |   #   get the I-vector and write it into our dataframe | ||||||
|   aaData[,i] <- aaindex[[i]]$I |   aaData[,i] <- aaindex[[i]]$I | ||||||
| } | } | ||||||
|  |  | ||||||
| # Sanity check | # Sanity check | ||||||
| plot(aaData[,37], aaData[,544])  # plot two arbitrary indices against each other | plot(aaData[,37], aaData[,544])  # plot two arbitrary indices against each other | ||||||
|  |  | ||||||
| # Looks good. | # Looks good. | ||||||
|  |  | ||||||
| # We finished building our data structure ... but let's add the aacost table | # We finished building our data structure ... but let's add the aacost table | ||||||
| # aacost is ordered differently: | # aacost is ordered differently: | ||||||
| rownames(aaData) | rownames(aaData) | ||||||
| aacost[ , 1] | aacost[ , 1] | ||||||
|  |  | ||||||
| # using order(), applied to aacost - ordering the column with column-name | # using order(), applied to aacost - ordering the column with column-name | ||||||
| # "aaa" | # "aaa" | ||||||
| sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes | sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes | ||||||
| aacost[sel, "aaa"] # applying the order vector sorts the column | aacost[sel, "aaa"] # applying the order vector sorts the column | ||||||
|  |  | ||||||
| # Is this the same order as refNames? | # Is this the same order as refNames? | ||||||
| refNames == aacost[sel, "aaa"]  # Yes! | refNames == aacost[sel, "aaa"]  # Yes! | ||||||
|  |  | ||||||
| # add the data from column "tot" (i.e. total metabolic cost) after the | # add the data from column "tot" (i.e. total metabolic cost) after the | ||||||
| # last column of aaData | # last column of aaData | ||||||
| aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"] | aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"] | ||||||
|  |  | ||||||
| # Done. | # Done. | ||||||
| str(aaData)  # A dataframe with 20 rows and 545 columns | str(aaData)  # A dataframe with 20 rows and 545 columns | ||||||
|  |  | ||||||
| # To answer the question "Which amino acids are similar to each other?" we | # To answer the question "Which amino acids are similar to each other?" we | ||||||
| # need to reduce this 545-dimensional dataset to fewer dimensions, otherwise | # need to reduce this 545-dimensional dataset to fewer dimensions, otherwise | ||||||
| # we will succumb to the "Curse of Dimensionality": | # we will succumb to the "Curse of Dimensionality": | ||||||
| # | # | ||||||
| #    "in high dimensional data, however, all objects appear | #    "in high dimensional data, however, all objects appear | ||||||
| #     to be sparse and dissimilar in many ways..." | #     to be sparse and dissimilar in many ways..." | ||||||
| #                   https://en.wikipedia.org/wiki/Curse_of_dimensionality | #                   https://en.wikipedia.org/wiki/Curse_of_dimensionality | ||||||
| # | # | ||||||
| # A classic way to do this is Principal Component Analysis (PCA) ... | # A classic way to do this is Principal Component Analysis (PCA) ... | ||||||
| # (Principal components analysis) | # (Principal components analysis) | ||||||
| # | # | ||||||
| # PCA expects objects in columns, properties in rows. Therefore we need to | # PCA expects objects in columns, properties in rows. Therefore we need to | ||||||
| # transpose our dataset: | # transpose our dataset: | ||||||
|  |  | ||||||
| aaPCA <- prcomp(t(aaData)) | aaPCA <- prcomp(t(aaData)) | ||||||
|  |  | ||||||
| # This creates an error, because some of our indices contain NA values! | # This creates an error, because some of our indices contain NA values! | ||||||
| # Which indices are these? | # Which indices are these? | ||||||
|  |  | ||||||
| # We create a vector "sel" for which we check whether any element in each | # We create a vector "sel" for which we check whether any element in each | ||||||
| # column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can | # column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can | ||||||
| # then use this vector to subset our dataframe. | # then use this vector to subset our dataframe. | ||||||
|  |  | ||||||
| sel <- logical() | sel <- logical() | ||||||
|  |  | ||||||
| for (i in 1:ncol(aaData)) {         # for each index | for (i in 1:ncol(aaData)) {         # for each index | ||||||
|   if (any(is.na(aaData[,i]))) {     #   if there is any NA value ... |   if (any(is.na(aaData[,i]))) {     #   if there is any NA value ... | ||||||
|     sel <- c(sel, FALSE)            #     add a FALSE element to the vector |     sel <- c(sel, FALSE)            #     add a FALSE element to the vector | ||||||
|   } else {                          #   else |   } else {                          #   else | ||||||
|     sel <- c(sel, TRUE)             #     add a TRUE element |     sel <- c(sel, TRUE)             #     add a TRUE element | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| # Done. sel now subsets only the NA-free columns | # Done. sel now subsets only the NA-free columns | ||||||
| 545 - sum(sel)                      # 13 columns excluded | 545 - sum(sel)                      # 13 columns excluded | ||||||
|  |  | ||||||
| # Do the PCA ... use the prcomp() function | # Do the PCA ... use the prcomp() function | ||||||
| aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set | aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set | ||||||
|  |  | ||||||
| str(aaPCA)   # structure of the result | str(aaPCA)   # structure of the result | ||||||
|  |  | ||||||
| plot(aaPCA)                         # plot the contributions of the | plot(aaPCA)                         # plot the contributions of the | ||||||
|                                     # components to the variance |                                     # components to the variance | ||||||
|  |  | ||||||
| plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC | plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC | ||||||
|      aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame |      aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame | ||||||
|      type ="n")                     # just to set up the coordinate system |      type ="n")                     # just to set up the coordinate system | ||||||
|  |  | ||||||
| text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into | text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into | ||||||
|      aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions |      aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions | ||||||
|      labels = rownames(aaPCA$rotation)) |      labels = rownames(aaPCA$rotation)) | ||||||
|  |  | ||||||
| # PCA results are sensitive to the absolute numeric value of the features that | # PCA results are sensitive to the absolute numeric value of the features that | ||||||
| # we are comparing. The prcomp() function has an option scale. = TRUE that | # we are comparing. The prcomp() function has an option scale. = TRUE that | ||||||
| # scales each row of features so that the variance of the value is 1.0  This | # scales each row of features so that the variance of the value is 1.0  This | ||||||
| # ensures that each feature is given approximately equal weight | # ensures that each feature is given approximately equal weight | ||||||
|  |  | ||||||
| aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE) | aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE) | ||||||
|  |  | ||||||
| plot(aaPCA) | plot(aaPCA) | ||||||
|  |  | ||||||
| plot(aaPCA$rotation[ , 1], | plot(aaPCA$rotation[ , 1], | ||||||
|      aaPCA$rotation[ , 2], |      aaPCA$rotation[ , 2], | ||||||
|      type ="n") |      type ="n") | ||||||
| text(aaPCA$rotation[ , 1], | text(aaPCA$rotation[ , 1], | ||||||
|      aaPCA$rotation[ , 2], |      aaPCA$rotation[ , 2], | ||||||
|      labels = rownames(aaPCA$rotation)) |      labels = rownames(aaPCA$rotation)) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Next we try to identify what the PCs correspond to. We see whether there are | # Next we try to identify what the PCs correspond to. We see whether there are | ||||||
| # specific features that are highly correlated with the PCs | # specific features that are highly correlated with the PCs | ||||||
|  |  | ||||||
| # ==== Rotation 1 =================== | # ==== Rotation 1 =================== | ||||||
| # | # | ||||||
|  |  | ||||||
(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1 (the parentheses also print it)

# The function cor() calculates Pearson coefficients of correlation
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37


# Correlate PC1 with every column of aaData. vapply() over seq_len() creates
# the result vector at its final length instead of growing it one element at
# a time, does nothing (rather than iterating over c(1, 0)) if aaData has no
# columns, and checks that every cor() call returns exactly one number.
cors <- vapply(seq_len(ncol(aaData)),
               function(i) cor(PC1, aaData[ , i]),
               numeric(1))

summary(cors)
#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
#
#  The max correlation is ~0.6. That is not very high. Which index is it?

# na.rm = TRUE is required here: cors contains NA values (see summary above)
which(cors == max(cors, na.rm = TRUE))

aaindex[[504]]   # Linker propensity ???

cor(PC1, aaindex[[504]]$I) # Did we get the right index?

# Plot this ...
plot(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     type ="n")
text(aaPCA$rotation[ , 1],
     aaindex[[504]]$I,
     labels = rownames(aaPCA$rotation))
|  |  | ||||||
| # This is essentially a random correlation but for Cysteine ... | # This is essentially a random correlation but for Cysteine ... | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==== Rotation 2 =================== | # ==== Rotation 2 =================== | ||||||
| # | # | ||||||
| # same process | # same process | ||||||
PC2 <- aaPCA$rotation[ , 2]    # Assign PC2

# Correlate PC2 with every column of aaData. vapply() over seq_len() creates
# the result vector at its final length instead of growing it one element at
# a time, does nothing (rather than iterating over c(1, 0)) if aaData has no
# columns, and checks that every cor() call returns exactly one number.
cors2 <- vapply(seq_len(ncol(aaData)),
                function(i) cor(PC2, aaData[ , i]),
                numeric(1))

summary(cors2)
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13

# Here we have quite strong correlations

# na.rm = TRUE is required here: cors2 contains NA values (see summary above)
which(cors2 == max(cors2, na.rm = TRUE))

aaindex[[148]]

# this index itself is correlated with many other indices

cor(PC2, aaindex[[148]]$I)   # confirm that we have the right index

# Plot this too...
plot(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     type ="n")
text(aaPCA$rotation[ , 2],
     aaindex[[148]]$I,
     labels = rownames(aaPCA$rotation))
|  |  | ||||||
| # This correlates well with hydrophobicity measures. In this case the | # This correlates well with hydrophobicity measures. In this case the | ||||||
| # PC is to a certain degree interpretable - but this is not always the case | # PC is to a certain degree interpretable - but this is not always the case | ||||||
| # with PCA (see the example of the first PC). | # with PCA (see the example of the first PC). | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,161 +1,161 @@ | |||||||
| # tocID <- "ABC-Install_all_packages.R" | # tocID <- "ABC-Install_all_packages.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              Installing all packages in this course | #              Installing all packages in this course | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.0 | ||||||
| # | # | ||||||
| # Date:     2021  10 | # Date:     2021  10 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.0    New code | #           1.0    New code | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                          Line | #TOC>   Section  Title                          Line | ||||||
| #TOC> ---------------------------------------------- | #TOC> ---------------------------------------------- | ||||||
| #TOC>   1        Packages                         33 | #TOC>   1        Packages                         33 | ||||||
| #TOC>   2        CRAN packages                    98 | #TOC>   2        CRAN packages                    98 | ||||||
| #TOC>   3        Bioconductor packages           127 | #TOC>   3        Bioconductor packages           127 | ||||||
| #TOC>   4        Other package sources           142 | #TOC>   4        Other package sources           142 | ||||||
| #TOC>   5        Updating packages               148 | #TOC>   5        Updating packages               148 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Packages  ============================================================ | # =    1  Packages  ============================================================ | ||||||
|  |  | ||||||
| # Much of R's functionality is contributed in packages: bundles of R scripts | # Much of R's functionality is contributed in packages: bundles of R scripts | ||||||
| # or code in other languages, pre-configured objects, and datasets. Making this | # or code in other languages, pre-configured objects, and datasets. Making this | ||||||
| # functionality available is often done by issuing a library(<package-name>) | # functionality available is often done by issuing a library(<package-name>) | ||||||
| # command, however this is not the preferred way, since it may override other | # command, however this is not the preferred way, since it may override other | ||||||
| # R functions and it makes it harder to understand where the source code of | # R functions and it makes it harder to understand where the source code of | ||||||
| # a particular function is located. In this course we call the function name | # a particular function is located. In this course we call the function name | ||||||
| # prefixed with the package name and two colons: | # prefixed with the package name and two colons: | ||||||
| #   <package-name>::<function-name>() | #   <package-name>::<function-name>() | ||||||
| # This is the preferred way, since it is explicit. | # This is the preferred way, since it is explicit. | ||||||
| # | # | ||||||
| # Regardless of which idiom one uses to call the actual function, the package | # Regardless of which idiom one uses to call the actual function, the package | ||||||
| #  needs to be "installed" first, i.e. the code must have been downloaded | #  needs to be "installed" first, i.e. the code must have been downloaded | ||||||
| # from CRAN, or using the BiocManager::install() function. | # from CRAN, or using the BiocManager::install() function. | ||||||
| # | # | ||||||
| # This script contains download commands for all packages that are used in the | # This script contains download commands for all packages that are used in the | ||||||
| # course. You can execute the script line by line (or even source the entire | # course. You can execute the script line by line (or even source the entire | ||||||
| # script) to make sure all packages can be installed on your computer. Just | # script) to make sure all packages can be installed on your computer. Just | ||||||
| # one reminder: if you are ever asked to install from source, the correct | # one reminder: if you are ever asked to install from source, the correct | ||||||
| # answer is usually "no" - except if you really know what you are doing and why. | # answer is usually "no" - except if you really know what you are doing and why. | ||||||
| # | # | ||||||
| # Once packages are installed you can get additional information about | # Once packages are installed you can get additional information about | ||||||
| # the contents of a package with the commands: | # the contents of a package with the commands: | ||||||
| #  library(help=<package-name>)       # basic information | #  library(help=<package-name>)       # basic information | ||||||
| #  browseVignettes("<package-name>")  # available vignettes | #  browseVignettes("<package-name>")  # available vignettes | ||||||
| #  data(package = "<package-name>")   # available datasets | #  data(package = "<package-name>")   # available datasets | ||||||
| # | # | ||||||
| #  ... and you can load data sets with: | #  ... and you can load data sets with: | ||||||
| #  data(<data-set-name>, package = "<package-name>") | #  data(<data-set-name>, package = "<package-name>") | ||||||
| # | # | ||||||
| #  All packages here are installed only when they have not been installed | #  All packages here are installed only when they have not been installed | ||||||
| #  before, using the following idiom: | #  before, using the following idiom: | ||||||
| # | # | ||||||
| #     if (! requireNamespace("<package-name>", quietly=TRUE)) { | #     if (! requireNamespace("<package-name>", quietly=TRUE)) { | ||||||
| #       install.packages("<package-name>") | #       install.packages("<package-name>") | ||||||
| #     } | #     } | ||||||
| # | # | ||||||
| #  ... or its BiocManager::install() equivalent: | #  ... or its BiocManager::install() equivalent: | ||||||
| # | # | ||||||
| # if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) { | # if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) { | ||||||
| #   BiocManager::install("<bioconductor-package-name>") | #   BiocManager::install("<bioconductor-package-name>") | ||||||
| # } | # } | ||||||
| # | # | ||||||
| #  If you want to _force_ a re-installation of the package, simply issue | #  If you want to _force_ a re-installation of the package, simply issue | ||||||
| #  the install.packages("<package-name>") command on its own. For compactness | #  the install.packages("<package-name>") command on its own. For compactness | ||||||
| #  we wrap the idiom into a function, which can also switch between CRAN | #  we wrap the idiom into a function, which can also switch between CRAN | ||||||
| #  and BIOconductor sources: | #  and BIOconductor sources: | ||||||
|  |  | ||||||
installIfNeeded <- function(package, s = "CRAN") {
  # Install package from source s ("CRAN" or "BIO") unless its namespace loads.
  absent <- function(p) { ! requireNamespace(p, quietly=TRUE) }
  if (s == "BIO") {
    # BiocManager::install() is called below, so BiocManager must exist first.
    if (absent("BiocManager")) {
      install.packages("BiocManager")
    }
    if (absent(package)) {
      BiocManager::install(package)
    }
  } else if (s == "CRAN") {
    if (absent(package)) { install.packages(package) }
  } else {
    stop(sprintf("Unknown source \"%s\".", s))  # neither "CRAN" nor "BIO"
  }
}
|  |  | ||||||
|  |  | ||||||
| # =    2  CRAN packages  ======================================================= | # =    2  CRAN packages  ======================================================= | ||||||
|  |  | ||||||
invisible(lapply(c("ape",             # each name is passed, in this order,
                   "BiocManager",     # to installIfNeeded() with its default
                   "bio3d",           # source s = "CRAN"
                   "evd",
                   "ggseqlogo",
                   "ggtern",
                   "hexbin",
                   "httr",
                   "igraph",
                   "jsonlite",
                   "magrittr",
                   "MASS",
                   "microbenchmark",
                   "phangorn",
                   "plotly",
                   "plotrix",
                   "profvis",
                   "robustbase",
                   "RColorBrewer",
                   "Rphylip",
                   "rvest",
                   "seqinr",
                   "stringi",
                   "taxize",
                   "testthat",
                   "xml2"), installIfNeeded))
|  |  | ||||||
| # =    3  Bioconductor packages  =============================================== | # =    3  Bioconductor packages  =============================================== | ||||||
|  |  | ||||||
invisible(lapply(c("Biobase",         # each name is passed, in this order,
                   "biomaRt",         # to installIfNeeded() with s = "BIO"
                   "Biostrings",
                   "DECIPHER",
                   "GEOquery",
                   "GOSim",
                   "limma",
                   "msa",
                   "org.Sc.sgd.db",
                   "prada",
                   "topGO"), installIfNeeded, s = "BIO"))
|  |  | ||||||
|  |  | ||||||
| # =    4  Other package sources  =============================================== | # =    4  Other package sources  =============================================== | ||||||
|  |  | ||||||
| # Using sources other than CRAN or Bioconductor to download general-purpose | # Using sources other than CRAN or Bioconductor to download general-purpose | ||||||
| # programs that run on your computer is not generally recommended. | # programs that run on your computer is not generally recommended. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    5  Updating packages  =================================================== | # =    5  Updating packages  =================================================== | ||||||
|  |  | ||||||
| # From time to time, bring the installed CRAN packages up to date with the following command ... | # From time to time, bring the installed CRAN packages up to date with the following command ... | ||||||
|  |  | ||||||
| update.packages() | update.packages() | ||||||
|  |  | ||||||
| # ... and also update Bioconductor packages by calling BiocManager::install() without a package name: | # ... and also update Bioconductor packages by calling BiocManager::install() without a package name: | ||||||
|  |  | ||||||
| BiocManager::install() | BiocManager::install() | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,100 +1,100 @@ | |||||||
| # addSACCE_APSESproteins.R | # addSACCE_APSESproteins.R | ||||||
| # Adds the Saccharomyces cerevisiae APSES proteins to myDB | # Adds the Saccharomyces cerevisiae APSES proteins to myDB | ||||||
| # | # | ||||||
|  |  | ||||||
# Append one row for SWI4_SACCE (RefSeq NP_011036, UniProt P25302) to
# myDB$protein. The row is assembled inside local() so that the helper
# variables do not remain in the workspace.
myDB$protein <- local({
  # Sequence as pasted: a position number followed by blocks of ten residues
  # per line. dbSanitizeSequence() is defined elsewhere; presumably it strips
  # the numbers and whitespace -- TODO confirm.
  pastedSeq <- "
        1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
       61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
       121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
       181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
       241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
       301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
       361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
       421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
       481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
       541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
       601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
       661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
       721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
       781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
       841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
       901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
       961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
       1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
       1081 klddiekdlr ana"
  newRow <- data.frame(
    ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
    name = "SWI4_SACCE",
    RefSeqID = "NP_011036",
    UniProtID = "P25302",
    taxonomy.ID = 4932L,
    sequence = dbSanitizeSequence(pastedSeq),
    stringsAsFactors = FALSE
  )
  rbind(myDB$protein, newRow)
})
|  |  | ||||||
# Append one row for PHD1_SACCE (RefSeq NP_012881, UniProt P36093) to
# myDB$protein. The row is assembled inside local() so that the helper
# variables do not remain in the workspace.
myDB$protein <- local({
  # Sequence as pasted: a position number followed by blocks of ten residues
  # per line. dbSanitizeSequence() is defined elsewhere; presumably it strips
  # the numbers and whitespace -- TODO confirm.
  pastedSeq <- "
        1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
       61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
      121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
      181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
      241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
      301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
      361 aknels"
  newRow <- data.frame(
    ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
    name = "PHD1_SACCE",
    RefSeqID = "NP_012881",
    UniProtID = "P36093",
    taxonomy.ID = 4932L,
    sequence = dbSanitizeSequence(pastedSeq),
    stringsAsFactors = FALSE
  )
  rbind(myDB$protein, newRow)
})
|  |  | ||||||
# Append one row for SOK2_SACCE (RefSeq NP_013729, UniProt P53438) to
# myDB$protein. The row is assembled inside local() so that the helper
# variables do not remain in the workspace.
myDB$protein <- local({
  # Sequence as pasted: a position number followed by blocks of ten residues
  # per line. dbSanitizeSequence() is defined elsewhere; presumably it strips
  # the numbers and whitespace -- TODO confirm.
  pastedSeq <- "
        1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
       61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
      121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
      181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
      241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
      301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
      361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
      421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
      481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
      541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
      601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
      661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
      721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
      781 kkqek"
  newRow <- data.frame(
    ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
    name = "SOK2_SACCE",
    RefSeqID = "NP_013729",
    UniProtID = "P53438",
    taxonomy.ID = 4932L,
    sequence = dbSanitizeSequence(pastedSeq),
    stringsAsFactors = FALSE
  )
  rbind(myDB$protein, newRow)
})
|  |  | ||||||
# Append one row for XBP1_SACCE (RefSeq NP_012165, UniProt P40489) to
# myDB$protein. The row is assembled inside local() so that the helper
# variables do not remain in the workspace.
myDB$protein <- local({
  # Sequence as pasted: a position number followed by blocks of ten residues
  # per line. dbSanitizeSequence() is defined elsewhere; presumably it strips
  # the numbers and whitespace -- TODO confirm.
  pastedSeq <- "
        1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
       61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
      121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
      181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
      241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
      301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
      361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
      421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
      481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
      541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
      601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"
  newRow <- data.frame(
    ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
    name = "XBP1_SACCE",
    RefSeqID = "NP_012165",
    UniProtID = "P40489",
    taxonomy.ID = 4932L,
    sequence = dbSanitizeSequence(pastedSeq),
    stringsAsFactors = FALSE
  )
  rbind(myDB$protein, newRow)
})
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										138
									
								
								ABC-units.R
									
									
									
									
									
								
							
							
						
						
									
										138
									
								
								ABC-units.R
									
									
									
									
									
								
							| @@ -1,69 +1,69 @@ | |||||||
| # ABC-units.R | # ABC-units.R | ||||||
| # | # | ||||||
| # Purpose: A Bioinformatics Course: R code for learning units | # Purpose: A Bioinformatics Course: R code for learning units | ||||||
| # | # | ||||||
| # Version: 4.0 | # Version: 4.0 | ||||||
| # | # | ||||||
| # Date:    2020  09  16 | # Date:    2020  09  16 | ||||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| # V 4.0    2020 version | # V 4.0    2020 version | ||||||
| # V 3.0    2019 version | # V 3.0    2019 version | ||||||
| # V 2.0    2018 version | # V 2.0    2018 version | ||||||
| # V 1.0    2017 version | # V 1.0    2017 version | ||||||
| # V 0.1    First code | # V 0.1    First code | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||||
| # | # | ||||||
| # The R-scripts and datasets in this project will be continuously updated, | # The R-scripts and datasets in this project will be continuously updated, | ||||||
| # and updates will be posted on GitHub. To bring your version into the latest | # and updates will be posted on GitHub. To bring your version into the latest | ||||||
| # state use the Git-pane (top left) and "pull" (blue downward arrow) from the | # state use the Git-pane (top left) and "pull" (blue downward arrow) from the | ||||||
| # repository. However, this will overwrite locally edited versions of files. | # repository. However, this will overwrite locally edited versions of files. | ||||||
|  |  | ||||||
| # To edit code and experiment with it, for example to add your own comments and | # To edit code and experiment with it, for example to add your own comments and | ||||||
| # examples, save your edited version into the "myScripts" folder. Otherwise you | # examples, save your edited version into the "myScripts" folder. Otherwise you | ||||||
| # may have problems with git when you update the project to a new version. It's | # may have problems with git when you update the project to a new version. It's | ||||||
| # good practice to change the filename, for example by prepending your initials. | # good practice to change the filename, for example by prepending your initials. | ||||||
| # This helps distinguish the files you are working with e.g. in a list of | # This helps distinguish the files you are working with e.g. in a list of | ||||||
| # recent files. For example if your name is Honjo Tasuku, your edited | # recent files. For example if your name is Honjo Tasuku, your edited | ||||||
| # BIN-Sequence.R might be named HT-BIN-Sequence.R | # BIN-Sequence.R might be named HT-BIN-Sequence.R | ||||||
|  |  | ||||||
| # If you pull from github and get the following type of error ... | # If you pull from github and get the following type of error ... | ||||||
| #     --------------- | #     --------------- | ||||||
| #     error: Your local changes to the following files would be | #     error: Your local changes to the following files would be | ||||||
| #     overwritten by merge | #     overwritten by merge | ||||||
| #     ... | #     ... | ||||||
| #     Please commit your changes or stash them before you can merge. | #     Please commit your changes or stash them before you can merge. | ||||||
| #     --------------- | #     --------------- | ||||||
| # ... then, you need to bring the offending file into its original state. | # ... then, you need to bring the offending file into its original state. | ||||||
| # Open the Commit window, select the file, and click on the Revert button. | # Open the Commit window, select the file, and click on the Revert button. | ||||||
| # | # | ||||||
| # When working with these scripts DO NOT SIMPLY  source()  THESE FILES! | # When working with these scripts DO NOT SIMPLY  source()  THESE FILES! | ||||||
|  |  | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
| # Once you have typed and executed the function init(), you will find a file | # Once you have typed and executed the function init(), you will find a file | ||||||
| # called myScript.R in the project directory. | # called myScript.R in the project directory. | ||||||
| # | # | ||||||
| # Open it, you can place all of your code-experiments and notes into that | # Open it, you can place all of your code-experiments and notes into that | ||||||
| # file. This will complement your "Course Journal". If you keep all contents in | # file. This will complement your "Course Journal". If you keep all contents in | ||||||
| # this one file, you can find everything by using the <cmd>-F find function. To | # this one file, you can find everything by using the <cmd>-F find function. To | ||||||
| # cross-reference code in your journal, create section headings. | # cross-reference code in your journal, create section headings. | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
| # The individual learning units' files can be opened by simply clicking on them | # The individual learning units' files can be opened by simply clicking on them | ||||||
| # in the File pane. | # in the File pane. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,16 +1,16 @@ | |||||||
| Version: 1.0 | Version: 1.0 | ||||||
|  |  | ||||||
| RestoreWorkspace: No | RestoreWorkspace: No | ||||||
| SaveWorkspace: No | SaveWorkspace: No | ||||||
| AlwaysSaveHistory: No | AlwaysSaveHistory: No | ||||||
|  |  | ||||||
| EnableCodeIndexing: Yes | EnableCodeIndexing: Yes | ||||||
| UseSpacesForTab: Yes | UseSpacesForTab: Yes | ||||||
| NumSpacesForTab: 2 | NumSpacesForTab: 2 | ||||||
| Encoding: UTF-8 | Encoding: UTF-8 | ||||||
|  |  | ||||||
| RnwWeave: knitr | RnwWeave: knitr | ||||||
| LaTeX: XeLaTeX | LaTeX: XeLaTeX | ||||||
|  |  | ||||||
| AutoAppendNewline: Yes | AutoAppendNewline: Yes | ||||||
| StripTrailingWhitespace: Yes | StripTrailingWhitespace: Yes | ||||||
|   | |||||||
							
								
								
									
										222
									
								
								BIN-ALI-BLAST.R
									
									
									
									
									
								
							
							
						
						
									
										222
									
								
								BIN-ALI-BLAST.R
									
									
									
									
									
								
							| @@ -1,111 +1,111 @@ | |||||||
| # tocID <- "BIN-ALI-BLAST.R" | # tocID <- "BIN-ALI-BLAST.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-BLAST unit. | #              R code accompanying the BIN-ALI-BLAST unit. | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # | # | ||||||
| # Version:  1.3 | # Version:  1.3 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.3    2020 Maintenance | #           1.3    2020 Maintenance | ||||||
| #           1.2    Change from require() to requireNamespace(), | #           1.2    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.1    Fixed parsing logic. | #           1.1    Fixed parsing logic. | ||||||
| #           1.0    First live version 2017. | #           1.0    First live version 2017. | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                               Line | #TOC>   Section  Title                               Line | ||||||
| #TOC> --------------------------------------------------- | #TOC> --------------------------------------------------- | ||||||
| #TOC>   1        Defining the APSES domain             45 | #TOC>   1        Defining the APSES domain             45 | ||||||
| #TOC>   2        Executing the BLAST search            75 | #TOC>   2        Executing the BLAST search            75 | ||||||
| #TOC>   3        Analysing results                     97 | #TOC>   3        Analysing results                     97 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Defining the APSES domain  =========================================== | # =    1  Defining the APSES domain  =========================================== | ||||||
|  |  | ||||||
| # Load your protein database | # Load your protein database | ||||||
| source("makeProteinDB.R") | source("makeProteinDB.R") | ||||||
|  |  | ||||||
| # Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You | # Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You | ||||||
| # have entered this data into your database in the | # have entered this data into your database in the | ||||||
| # BIN-ALI-Optimal_sequence_alignment unit.) | # BIN-ALI-Optimal_sequence_alignment unit.) | ||||||
|  |  | ||||||
| ( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct | ( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct | ||||||
|                                                 # name of the Mbp1 orthologue |                                                 # name of the Mbp1 orthologue | ||||||
|                                                 # of Mbp1 in your protein |                                                 # of Mbp1 in your protein | ||||||
|                                                 # database, DON'T continue. We |                                                 # database, DON'T continue. We | ||||||
|                                                 # need to fix this problem. |                                                 # need to fix this problem. | ||||||
|                                                 # Get in touch. |                                                 # Get in touch. | ||||||
|  |  | ||||||
| (proID <- myDB$protein$ID[myDB$protein$name == myOrth]) | (proID <- myDB$protein$ID[myDB$protein$name == myOrth]) | ||||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||||
|                                myDB$annotation$featureID == ftrID]) |                                myDB$annotation$featureID == ftrID]) | ||||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||||
|                  start, |                  start, | ||||||
|                  end)) |                  end)) | ||||||
|  |  | ||||||
| # The MYSPE "apses" sequence is the sequence that we will use for our reverse | # The MYSPE "apses" sequence is the sequence that we will use for our reverse | ||||||
| # BLAST search. | # BLAST search. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Executing the BLAST search  ========================================== | # =    2  Executing the BLAST search  ========================================== | ||||||
|  |  | ||||||
| # The ./scripts/BLAST.R code defines two functions to access the BLAST interface | # The ./scripts/BLAST.R code defines two functions to access the BLAST interface | ||||||
| # through its Web API, and to parse results. Have a look at the script, then | # through its Web API, and to parse results. Have a look at the script, then | ||||||
| # source it: | # source it: | ||||||
|  |  | ||||||
| source("./scripts/BLAST.R") | source("./scripts/BLAST.R") | ||||||
|  |  | ||||||
| # Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces | # Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces | ||||||
| # cerevisiae: | # cerevisiae: | ||||||
|  |  | ||||||
| BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence | BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence | ||||||
|                      db = "refseq_protein",        # database to search in |                      db = "refseq_protein",        # database to search in | ||||||
|                      nHits = 10,                   # |                      nHits = 10,                   # | ||||||
|                      E = 0.01,                     # |                      E = 0.01,                     # | ||||||
|                      limits = "txid559292[ORGN]")  # S. cerevisiae S288c |                      limits = "txid559292[ORGN]")  # S. cerevisiae S288c | ||||||
|  |  | ||||||
|  |  | ||||||
| length(BLASTresults$hits)  # There should be at least one hit there. Ask for | length(BLASTresults$hits)  # There should be at least one hit there. Ask for | ||||||
|                            # advice in case this step fails. |                            # advice in case this step fails. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Analysing results  =================================================== | # =    3  Analysing results  =================================================== | ||||||
|  |  | ||||||
| (topHit <- BLASTresults$hits[[1]])   # Get the top hit | (topHit <- BLASTresults$hits[[1]])   # Get the top hit | ||||||
|  |  | ||||||
| # What is the refseq ID of the top hit? | # What is the refseq ID of the top hit? | ||||||
| topHit$accession | topHit$accession | ||||||
|  |  | ||||||
| # If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses | # If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses | ||||||
| # domain. If it is not, ask me for advice. | # domain. If it is not, ask me for advice. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,195 +1,195 @@ | |||||||
| # tocID <- "BIN-ALI-Dotplot.R" | # tocID <- "BIN-ALI-Dotplot.R" | ||||||
| # | # | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-Dotplot unit. | #              R code accompanying the BIN-ALI-Dotplot unit. | ||||||
| # | # | ||||||
| # Version:  0.2 | # Version:  0.2 | ||||||
| # | # | ||||||
| # Date:     2019  01  07 | # Date:     2019  01  07 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           0.2    Change from require() to requireNamespace(), | #           0.2    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout | #                      use <package>::<function>() idiom throughout | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                  Line | #TOC>   Section  Title                  Line | ||||||
| #TOC> -------------------------------------- | #TOC> -------------------------------------- | ||||||
| #TOC>   1        ___Section___            42 | #TOC>   1        ___Section___            42 | ||||||
| #TOC>   2        Tasks                   190 | #TOC>   2        Tasks                   190 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  ___Section___  ======================================================= | # =    1  ___Section___  ======================================================= | ||||||
|  |  | ||||||
| if (!requireNamespace("BiocManager", quietly=TRUE)) { | if (!requireNamespace("BiocManager", quietly=TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
| if (!requireNamespace("Biostrings", quietly=TRUE)) { | if (!requireNamespace("Biostrings", quietly=TRUE)) { | ||||||
|   BiocManager::install("Biostrings") |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
| if (!requireNamespace("seqinr", quietly=TRUE)) { | if (!requireNamespace("seqinr", quietly=TRUE)) { | ||||||
|   install.packages("seqinr") |   install.packages("seqinr") | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # Let's load BLOSUM62 | # Let's load BLOSUM62 | ||||||
| data(BLOSUM62, package = "Biostrings") | data(BLOSUM62, package = "Biostrings") | ||||||
|  |  | ||||||
| # Now let's craft code for a dotplot. That's surprisingly simple. We build a | # Now let's craft code for a dotplot. That's surprisingly simple. We build a | ||||||
| # matrix that has as many rows as one sequence, as many columns as another. Then | # matrix that has as many rows as one sequence, as many columns as another. Then | ||||||
| # we go through every cell of the matrix and enter the pairscore we encounter | # we go through every cell of the matrix and enter the pairscore we encounter | ||||||
| # for the amino acid pair whose position corresponds to the row and column | # for the amino acid pair whose position corresponds to the row and column | ||||||
| # index. Finally we visualize the matrix in a plot. | # index. Finally we visualize the matrix in a plot. | ||||||
| # | # | ||||||
|  |  | ||||||
| # First we fetch our sequences and split them into single characters. | # First we fetch our sequences and split them into single characters. | ||||||
| sel <- myDB$protein$name == "MBP1_SACCE" | sel <- myDB$protein$name == "MBP1_SACCE" | ||||||
| MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel]) | MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||||
| MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel]) | MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| # Check that we have two character vectors of the expected length. | # Check that we have two character vectors of the expected length. | ||||||
| str(MBP1_SACCE) | str(MBP1_SACCE) | ||||||
| str(MBP1_MYSPE) | str(MBP1_MYSPE) | ||||||
|  |  | ||||||
| # How do we get the pairscore values? Consider: a single pair of amino acids can | # How do we get the pairscore values? Consider: a single pair of amino acids can | ||||||
| # be obtained from sequence SACCE and MYSPE e.g. from position 13 and 21 ... | # be obtained from sequence SACCE and MYSPE e.g. from position 13 and 21 ... | ||||||
| MBP1_SACCE[13] | MBP1_SACCE[13] | ||||||
| MBP1_MYSPE[21] | MBP1_MYSPE[21] | ||||||
|  |  | ||||||
| # ... using these as subsetting expressions, we can pull the pairscore | # ... using these as subsetting expressions, we can pull the pairscore | ||||||
| # from the MDM | # from the MDM | ||||||
| BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]] | BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]] | ||||||
|  |  | ||||||
| # First we build an empty matrix that will hold all pairscores ... | # First we build an empty matrix that will hold all pairscores ... | ||||||
| dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)), | dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)), | ||||||
|                  nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE)) |                  nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE)) | ||||||
|  |  | ||||||
| # ... then we loop over the sequences and store the scores in the matrix. | # ... then we loop over the sequences and store the scores in the matrix. | ||||||
| # | # | ||||||
| for (i in 1:length(MBP1_SACCE)) { | for (i in 1:length(MBP1_SACCE)) { | ||||||
|   for (j in 1:length(MBP1_MYSPE)) { |   for (j in 1:length(MBP1_MYSPE)) { | ||||||
|     dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]] |     dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]] | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| # Even though this is a large matrix, this does not take much time ... | # Even though this is a large matrix, this does not take much time ... | ||||||
| # Let's have a look at a small block of the values: | # Let's have a look at a small block of the values: | ||||||
|  |  | ||||||
| dotMat[1:10, 1:10] | dotMat[1:10, 1:10] | ||||||
|  |  | ||||||
| # Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in | # Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in | ||||||
| # the matrix correspond to an amino acid from MBP1_MYSPE. | # the matrix correspond to an amino acid from MBP1_MYSPE. | ||||||
|  |  | ||||||
| # To plot this, we use the image() function. Here, with default parameters. | # To plot this, we use the image() function. Here, with default parameters. | ||||||
|  |  | ||||||
| image(dotMat) | image(dotMat) | ||||||
|  |  | ||||||
| # Be patient, this takes a few moments to render: more than 500,000 values. | # Be patient, this takes a few moments to render: more than 500,000 values. | ||||||
| # Nice. | # Nice. | ||||||
| # What do you expect? | # What do you expect? | ||||||
| # What would similar sequences look like? | # What would similar sequences look like? | ||||||
| # What do you see? | # What do you see? | ||||||
|  |  | ||||||
| # You might notice a thin line of yellow along the diagonal, moving approximately | # You might notice a thin line of yellow along the diagonal, moving approximately | ||||||
| # from bottom left to top right, fading in and out of existence. This is the | # from bottom left to top right, fading in and out of existence. This is the | ||||||
| # signature of extended sequence similarity. | # signature of extended sequence similarity. | ||||||
|  |  | ||||||
| # Let's magnify this a bit by looking at only the first 200 amino acids ... | # Let's magnify this a bit by looking at only the first 200 amino acids ... | ||||||
| image(dotMat[1:200, 1:200]) | image(dotMat[1:200, 1:200]) | ||||||
|  |  | ||||||
| # ... and, according to our normal writing convention, we would like the | # ... and, according to our normal writing convention, we would like the | ||||||
| # diagonal to run from top-left to bottom-right since we write from left to | # diagonal to run from top-left to bottom-right since we write from left to | ||||||
| # right and from top to bottom... | # right and from top to bottom... | ||||||
| image(dotMat[1:200, 1:200], ylim = 1.0:0.0) | image(dotMat[1:200, 1:200], ylim = 1.0:0.0) | ||||||
|  |  | ||||||
| # ... and we would like the range of the x- and y- axis to correspond to the | # ... and we would like the range of the x- and y- axis to correspond to the | ||||||
| # sequence position ... | # sequence position ... | ||||||
| image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1)) | image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1)) | ||||||
|  |  | ||||||
| # ... and labels! Axis labels would be nice ... | # ... and labels! Axis labels would be nice ... | ||||||
| image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1), | image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1), | ||||||
|       xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" ) |       xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE" ) | ||||||
|  |  | ||||||
| # ... and why don't we have axis-numbers on all four sides? Go, make that right | # ... and why don't we have axis-numbers on all four sides? Go, make that right | ||||||
| # too ... | # too ... | ||||||
| len <- 200 | len <- 200 | ||||||
| image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1), | image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1), | ||||||
|       xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE) |       xlab = "MBP1_MYSPE", ylab = "MBP1_SACCE", axes = FALSE) | ||||||
| box() | box() | ||||||
| axis(1, at = c(1, seq(10, len, by=10))) | axis(1, at = c(1, seq(10, len, by=10))) | ||||||
| axis(2, at = c(1, seq(10, len, by=10))) | axis(2, at = c(1, seq(10, len, by=10))) | ||||||
| axis(3, at = c(1, seq(10, len, by=10))) | axis(3, at = c(1, seq(10, len, by=10))) | ||||||
| axis(4, at = c(1, seq(10, len, by=10))) | axis(4, at = c(1, seq(10, len, by=10))) | ||||||
|  |  | ||||||
| # ... you get the idea, we can infinitely customize our plot. However a good way | # ... you get the idea, we can infinitely customize our plot. However a good way | ||||||
| # to do this is to develop a particular view for, say, a report or publication | # to do this is to develop a particular view for, say, a report or publication | ||||||
| # in a script and then put it into a function. I have put a function into the | # in a script and then put it into a function. I have put a function into the | ||||||
| # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because | # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because | ||||||
| # there already is a dotplot function in the seqinr package: | # there already is a dotplot function in the seqinr package: | ||||||
|  |  | ||||||
| seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr | seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr | ||||||
| dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Our's | dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Our's | ||||||
|  |  | ||||||
| # Which one do you prefer? You can probably see the block patterns that arise | # Which one do you prefer? You can probably see the block patterns that arise | ||||||
| # from segments of repetitive, low complexity sequence. But you probably have to | # from segments of repetitive, low complexity sequence. But you probably have to | ||||||
| # look very closely to discern the faint diagonals that correspond to similar | # look very closely to discern the faint diagonals that correspond to similar | ||||||
| # sequence. | # sequence. | ||||||
|  |  | ||||||
|  |  | ||||||
| # Let's see if we can enhance the contrast between distributed noise and the | # Let's see if we can enhance the contrast between distributed noise and the | ||||||
| # actual alignment of conserved residues. We can filter the dot matrix with a | # actual alignment of conserved residues. We can filter the dot matrix with a | ||||||
| # pattern that enhances diagonally repeated values. Every value in the matrix | # pattern that enhances diagonally repeated values. Every value in the matrix | ||||||
| # will be replaced by a weighted average of its neighborhood. Here is a | # will be replaced by a weighted average of its neighborhood. Here is a | ||||||
| # diagonal-filter: | # diagonal-filter: | ||||||
|  |  | ||||||
| myFilter <- matrix(numeric(25), nrow = 5) | myFilter <- matrix(numeric(25), nrow = 5) | ||||||
| myFilter[1, ] <- c( 1, 0, 0, 0, 0) | myFilter[1, ] <- c( 1, 0, 0, 0, 0) | ||||||
| myFilter[2, ] <- c( 0, 1, 0, 0, 0) | myFilter[2, ] <- c( 0, 1, 0, 0, 0) | ||||||
| myFilter[3, ] <- c( 0, 0, 1, 0, 0) | myFilter[3, ] <- c( 0, 0, 1, 0, 0) | ||||||
| myFilter[4, ] <- c( 0, 0, 0, 1, 0) | myFilter[4, ] <- c( 0, 0, 0, 1, 0) | ||||||
| myFilter[5, ] <- c( 0, 0, 0, 0, 1) | myFilter[5, ] <- c( 0, 0, 0, 0, 1) | ||||||
|  |  | ||||||
| # I have added the option to read such filters (or others that you could define on your own) as a parameter of the function. | # I have added the option to read such filters (or others that you could define on your own) as a parameter of the function. | ||||||
|  |  | ||||||
| dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter) | dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter) | ||||||
|  |  | ||||||
| # I think the result shows quite nicely how the two sequences are globally | # I think the result shows quite nicely how the two sequences are globally | ||||||
| # related and where the regions of sequence similarity are. Play with this a bit | # related and where the regions of sequence similarity are. Play with this a bit | ||||||
| # ...  Can you come up with a better filter? If so, eMail us. | # ...  Can you come up with a better filter? If so, eMail us. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Tasks  =============================================================== | # =    2  Tasks  =============================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										1256
									
								
								BIN-ALI-MSA.R
									
									
									
									
									
								
							
							
						
						
									
										1256
									
								
								BIN-ALI-MSA.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,365 +1,365 @@ | |||||||
| # tocID <- "BIN-ALI-Optimal_sequence_alignment.R" | # tocID <- "BIN-ALI-Optimal_sequence_alignment.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit. | #              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit. | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # Version:  1.7.1 | # Version:  1.7.1 | ||||||
| # | # | ||||||
| # Date:     2017-09   -   2020-10 | # Date:     2017-09   -   2020-10 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/ | #           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/ | ||||||
| #           1.7    2020 updates | #           1.7    2020 updates | ||||||
| #           1.6    Maintenance | #           1.6    Maintenance | ||||||
| #           1.5    Change from require() to requireNamespace(), | #           1.5    Change from require() to requireNamespace(), | ||||||
| #                    use <package>::<function>() idiom throughout | #                    use <package>::<function>() idiom throughout | ||||||
| #           1.4    Pull s2c() from seqinr package, rather than loading the | #           1.4    Pull s2c() from seqinr package, rather than loading the | ||||||
| #                    entire library. | #                    entire library. | ||||||
| #           1.3    Updated confirmation task with correct logic | #           1.3    Updated confirmation task with correct logic | ||||||
| #           1.2    Added missing load of seqinr package | #           1.2    Added missing load of seqinr package | ||||||
| #           1.1    Update annotation file logic - it could already have been | #           1.1    Update annotation file logic - it could already have been | ||||||
| #                    prepared in the BIN-FUNC-Annotation unit. | #                    prepared in the BIN-FUNC-Annotation unit. | ||||||
| #           1.0.1  bugfix | #           1.0.1  bugfix | ||||||
| #           1.0    First 2017 live version. | #           1.0    First 2017 live version. | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                      Line | #TOC>   Section  Title                                                      Line | ||||||
| #TOC> -------------------------------------------------------------------------- | #TOC> -------------------------------------------------------------------------- | ||||||
| #TOC>   1        Prepare                                                      58 | #TOC>   1        Prepare                                                      58 | ||||||
| #TOC>   2        Biostrings Pairwise Alignment                                75 | #TOC>   2        Biostrings Pairwise Alignment                                75 | ||||||
| #TOC>   2.1        Optimal global alignment                                   93 | #TOC>   2.1        Optimal global alignment                                   93 | ||||||
| #TOC>   2.2        Optimal local alignment                                   156 | #TOC>   2.2        Optimal local alignment                                   156 | ||||||
| #TOC>   3        APSES Domain annotation by alignment                        180 | #TOC>   3        APSES Domain annotation by alignment                        180 | ||||||
| #TOC>   4        Update your database script                                 261 | #TOC>   4        Update your database script                                 261 | ||||||
| #TOC>   4.1        Preparing an annotation file ...                          267 | #TOC>   4.1        Preparing an annotation file ...                          267 | ||||||
| #TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269 | #TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269 | ||||||
| #TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314 | #TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314 | ||||||
| #TOC>   4.2        Execute and Validate                                      338 | #TOC>   4.2        Execute and Validate                                      338 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Prepare  ============================================================= | # =    1  Prepare  ============================================================= | ||||||
|  |  | ||||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||||
|   install.packages("seqinr") |   install.packages("seqinr") | ||||||
| } | } | ||||||
| # You can get package information with the following commands: | # You can get package information with the following commands: | ||||||
| # library(help = seqinr)       # basic information | # library(help = seqinr)       # basic information | ||||||
| # browseVignettes("seqinr")    # available vignettes | # browseVignettes("seqinr")    # available vignettes | ||||||
| # data(package = "seqinr")     # available datasets | # data(package = "seqinr")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # You need to recreate the protein database that you have constructed in the | # You need to recreate the protein database that you have constructed in the | ||||||
| # BIN-Storing_data unit. | # BIN-Storing_data unit. | ||||||
|  |  | ||||||
| source("./myScripts/makeProteinDB.R") | source("./myScripts/makeProteinDB.R") | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Biostrings Pairwise Alignment  ======================================= | # =    2  Biostrings Pairwise Alignment  ======================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| if (!requireNamespace("BiocManager", quietly=TRUE)) { | if (!requireNamespace("BiocManager", quietly=TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
| if (!requireNamespace("Biostrings", quietly=TRUE)) { | if (!requireNamespace("Biostrings", quietly=TRUE)) { | ||||||
|   BiocManager::install("Biostrings") |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # Biostrings stores sequences in "XString" objects. Once we have converted our | # Biostrings stores sequences in "XString" objects. Once we have converted our | ||||||
| # target sequences to AAString objects, the alignment itself is straightforward. | # target sequences to AAString objects, the alignment itself is straightforward. | ||||||
|  |  | ||||||
| # ==   2.1  Optimal global alignment  ========================================== | # ==   2.1  Optimal global alignment  ========================================== | ||||||
|  |  | ||||||
| # The pairwiseAlignment() function was written to behave | # The pairwiseAlignment() function was written to behave | ||||||
| # exactly like the functions you encountered on the EMBOSS server. | # exactly like the functions you encountered on the EMBOSS server. | ||||||
|  |  | ||||||
| # First: make AAString objects ... | # First: make AAString objects ... | ||||||
| sel <- myDB$protein$name == "MBP1_SACCE" | sel <- myDB$protein$name == "MBP1_SACCE" | ||||||
| aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel]) | aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||||
| aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel]) | aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| ?pairwiseAlignment | ?pairwiseAlignment | ||||||
| # ... and align. | # ... and align. | ||||||
| # Global optimal alignment with end-gap penalties is default. | # Global optimal alignment with end-gap penalties is default. | ||||||
| ali1 <-  Biostrings::pairwiseAlignment( | ali1 <-  Biostrings::pairwiseAlignment( | ||||||
|   aaMBP1_SACCE, |   aaMBP1_SACCE, | ||||||
|   aaMBP1_MYSPE, |   aaMBP1_MYSPE, | ||||||
|   substitutionMatrix = "BLOSUM62", |   substitutionMatrix = "BLOSUM62", | ||||||
|   gapOpening = 10, |   gapOpening = 10, | ||||||
|   gapExtension = 0.5) |   gapExtension = 0.5) | ||||||
|  |  | ||||||
| str(ali1)  # ... it's complicated | str(ali1)  # ... it's complicated | ||||||
|  |  | ||||||
| # This is a Biostrings alignment object. But we can use Biostrings functions to | # This is a Biostrings alignment object. But we can use Biostrings functions to | ||||||
| # tame it: | # tame it: | ||||||
| ali1 | ali1 | ||||||
| Biostrings::writePairwiseAlignments(ali1)   # That should look familiar | Biostrings::writePairwiseAlignments(ali1)   # That should look familiar | ||||||
|  |  | ||||||
| # And we can make the internal structure work for us  (@ is for classes as | # And we can make the internal structure work for us  (@ is for classes as | ||||||
| # $ is for lists ...) | # $ is for lists ...) | ||||||
| str(ali1@pattern) | str(ali1@pattern) | ||||||
| ali1@pattern | ali1@pattern | ||||||
| ali1@pattern@range | ali1@pattern@range | ||||||
| ali1@pattern@indel | ali1@pattern@indel | ||||||
| ali1@pattern@mismatch | ali1@pattern@mismatch | ||||||
|  |  | ||||||
| # or work with "normal" R functions | # or work with "normal" R functions | ||||||
| # the alignment length | # the alignment length | ||||||
| nchar(as.character(ali1@pattern)) | nchar(as.character(ali1@pattern)) | ||||||
|  |  | ||||||
| # the number of identities | # the number of identities | ||||||
| sum(seqinr::s2c(as.character(ali1@pattern)) == | sum(seqinr::s2c(as.character(ali1@pattern)) == | ||||||
|     seqinr::s2c(as.character(ali1@subject))) |     seqinr::s2c(as.character(ali1@subject))) | ||||||
|  |  | ||||||
| # ... e.g. to calculate the percentage of identities | # ... e.g. to calculate the percentage of identities | ||||||
| 100 * | 100 * | ||||||
|   sum(seqinr::s2c(as.character(ali1@pattern)) == |   sum(seqinr::s2c(as.character(ali1@pattern)) == | ||||||
|       seqinr::s2c(as.character(ali1@subject))) / |       seqinr::s2c(as.character(ali1@subject))) / | ||||||
|   nchar(as.character(ali1@pattern)) |   nchar(as.character(ali1@pattern)) | ||||||
| # ... which should be the same as reported in the writePairwiseAlignments() | # ... which should be the same as reported in the writePairwiseAlignments() | ||||||
| # output. Awkward to type? Then it calls for a function: | # output. Awkward to type? Then it calls for a function: | ||||||
| # | # | ||||||
| percentID <- function(al) { | percentID <- function(al) { | ||||||
|   # Returns the percent identity of a Biostrings alignment object: 100 * (number of identical aligned positions) / (alignment length) |   # Returns the percent identity of a Biostrings alignment object: 100 * (number of identical aligned positions) / (alignment length) | ||||||
|   return(100 * |   return(100 * | ||||||
|          sum(seqinr::s2c(as.character(al@pattern)) == |          sum(seqinr::s2c(as.character(al@pattern)) == | ||||||
|              seqinr::s2c(as.character(al@subject))) / |              seqinr::s2c(as.character(al@subject))) / | ||||||
|          nchar(as.character(al@pattern))) |          nchar(as.character(al@pattern))) | ||||||
| } | } | ||||||
|  |  | ||||||
| percentID(ali1) | percentID(ali1) | ||||||
|  |  | ||||||
| # ==   2.2  Optimal local alignment  =========================================== | # ==   2.2  Optimal local alignment  =========================================== | ||||||
|  |  | ||||||
| # Compare with local optimal alignment (like EMBOSS Water) | # Compare with local optimal alignment (like EMBOSS Water) | ||||||
| ali2 <-  Biostrings::pairwiseAlignment( | ali2 <-  Biostrings::pairwiseAlignment( | ||||||
|   aaMBP1_SACCE, |   aaMBP1_SACCE, | ||||||
|   aaMBP1_MYSPE, |   aaMBP1_MYSPE, | ||||||
|   type = "local", |   type = "local", | ||||||
|   substitutionMatrix = "BLOSUM62", |   substitutionMatrix = "BLOSUM62", | ||||||
|   gapOpening = 50, |   gapOpening = 50, | ||||||
|   gapExtension = 10) |   gapExtension = 10) | ||||||
|  |  | ||||||
| Biostrings::writePairwiseAlignments(ali2) | Biostrings::writePairwiseAlignments(ali2) | ||||||
| # This has probably only aligned the N-terminal DNA binding domain - but that | # This has probably only aligned the N-terminal DNA binding domain - but that | ||||||
| # one has quite high sequence identity: | # one has quite high sequence identity: | ||||||
| percentID(ali2) | percentID(ali2) | ||||||
|  |  | ||||||
| # == TASK: == | # == TASK: == | ||||||
|  |  | ||||||
| # Compare the two alignments. I have weighted the local alignment heavily | # Compare the two alignments. I have weighted the local alignment heavily | ||||||
| # towards an ungapped alignment by setting very high gap penalties. Try changing | # towards an ungapped alignment by setting very high gap penalties. Try changing | ||||||
| # the gap penalties and see what happens: how does the number of indels change, | # the gap penalties and see what happens: how does the number of indels change, | ||||||
| # how does the length of indels change... | # how does the length of indels change... | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  APSES Domain annotation by alignment  ================================ | # =    3  APSES Domain annotation by alignment  ================================ | ||||||
|  |  | ||||||
| # In this section we define the MYSPE APSES sequence by performing a global, | # In this section we define the MYSPE APSES sequence by performing a global, | ||||||
| # optimal sequence alignment of the yeast APSES domain with the full length | # optimal sequence alignment of the yeast APSES domain with the full length | ||||||
| # protein sequence of the protein that was the most similar to the yeast APSES | # protein sequence of the protein that was the most similar to the yeast APSES | ||||||
| # domain. | # domain. | ||||||
| # | # | ||||||
|  |  | ||||||
| # I have annotated the yeast APSES domain as a feature in the | # I have annotated the yeast APSES domain as a feature in the | ||||||
| # database. To view the annotation, we can retrieve it via the proteinID and | # database. To view the annotation, we can retrieve it via the proteinID and | ||||||
| # featureID. Here is the yeast protein ID: | # featureID. Here is the yeast protein ID: | ||||||
| (proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]) | (proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ... and if you look at the feature table, you can identify the feature ID | # ... and if you look at the feature table, you can identify the feature ID | ||||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||||
|  |  | ||||||
| # ... and with the two annotations we can get the corresponding ID from the | # ... and with the two annotations we can get the corresponding ID from the | ||||||
| # annotation table | # annotation table | ||||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||||
|                              myDB$annotation$featureID == ftrID]) |                              myDB$annotation$featureID == ftrID]) | ||||||
|  |  | ||||||
| # Show the complete annotation record that joins this protein with this feature | # Show the complete annotation record that joins this protein with this feature | ||||||
| myDB$annotation[myDB$annotation$proteinID == proID & | myDB$annotation[myDB$annotation$proteinID == proID & | ||||||
|                 myDB$annotation$featureID == ftrID, ] |                 myDB$annotation$featureID == ftrID, ] | ||||||
|  |  | ||||||
| # The annotation record contains the start and end coordinates which we can use | # The annotation record contains the start and end coordinates which we can use | ||||||
| # to define the APSES domain sequence with a substr() expression. | # to define the APSES domain sequence with a substr() expression. | ||||||
|  |  | ||||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||||
|                  start, |                  start, | ||||||
|                  end)) |                  end)) | ||||||
|  |  | ||||||
| # Lots of code. But don't get lost. Let's recapitulate what we have done: we | # Lots of code. But don't get lost. Let's recapitulate what we have done: we | ||||||
| # have selected from the sequence column of the protein table the sequence whose | # have selected from the sequence column of the protein table the sequence whose | ||||||
| # name is "MBP1_SACCE", and selected from the annotation table the start | # name is "MBP1_SACCE", and selected from the annotation table the start | ||||||
| # and end coordinates of the annotation that joins an "APSES fold" feature with | # and end coordinates of the annotation that joins an "APSES fold" feature with | ||||||
| # the sequence, and used the start and end coordinates to extract a substring. | # the sequence, and used the start and end coordinates to extract a substring. | ||||||
|  |  | ||||||
| # Let's convert this to an AAString and assign it: | # Let's convert this to an AAString and assign it: | ||||||
| aaMB1_SACCE_APSES <- Biostrings::AAString(apses) | aaMB1_SACCE_APSES <- Biostrings::AAString(apses) | ||||||
|  |  | ||||||
| # Now let's align these two sequences of very different length without end-gap | # Now let's align these two sequences of very different length without end-gap | ||||||
| # penalties using the "overlap" type. "overlap" turns the | # penalties using the "overlap" type. "overlap" turns the | ||||||
| # end-gap penalties off and that is crucially important since | # end-gap penalties off and that is crucially important since | ||||||
| # the sequences have very different length. | # the sequences have very different length. | ||||||
|  |  | ||||||
| aliApses <-  Biostrings::pairwiseAlignment( | aliApses <-  Biostrings::pairwiseAlignment( | ||||||
|   aaMB1_SACCE_APSES, |   aaMB1_SACCE_APSES, | ||||||
|   aaMBP1_MYSPE, |   aaMBP1_MYSPE, | ||||||
|   type = "overlap", |   type = "overlap", | ||||||
|   substitutionMatrix = "BLOSUM62", |   substitutionMatrix = "BLOSUM62", | ||||||
|   gapOpening = 10, |   gapOpening = 10, | ||||||
|   gapExtension = 0.5) |   gapExtension = 0.5) | ||||||
|  |  | ||||||
| # Inspect the result. The aligned sequences should be clearly | # Inspect the result. The aligned sequences should be clearly | ||||||
| # homologous, and have (almost) no indels. The entire "pattern" | # homologous, and have (almost) no indels. The entire "pattern" | ||||||
| # sequence from QIYSAR ... to ... KPLFDF  should be matched | # sequence from QIYSAR ... to ... KPLFDF  should be matched | ||||||
| # with the "subject". Is this correct? | # with the "subject". Is this correct? | ||||||
| Biostrings::writePairwiseAlignments(aliApses) | Biostrings::writePairwiseAlignments(aliApses) | ||||||
|  |  | ||||||
| # If this is correct, you can extract the matched sequence from | # If this is correct, you can extract the matched sequence from | ||||||
| # the alignment object. The syntax is a bit different from what | # the alignment object. The syntax is a bit different from what | ||||||
| # you have seen before: this is an "S4 object", not a list. No | # you have seen before: this is an "S4 object", not a list. No | ||||||
| # worries: as.character() returns a normal string. | # worries: as.character() returns a normal string. | ||||||
| as.character(aliApses@subject) | as.character(aliApses@subject) | ||||||
|  |  | ||||||
| # Now, what are the aligned start and end coordinates? You can read them from | # Now, what are the aligned start and end coordinates? You can read them from | ||||||
| # the output of writePairwiseAlignments(), or you can get them from the range of | # the output of writePairwiseAlignments(), or you can get them from the range of | ||||||
| # the match. | # the match. | ||||||
|  |  | ||||||
| str(aliApses@subject@range) | str(aliApses@subject@range) | ||||||
|  |  | ||||||
| # start is: | # start is: | ||||||
| aliApses@subject@range@start | aliApses@subject@range@start | ||||||
|  |  | ||||||
| # ... and end is: | # ... and end is: | ||||||
| aliApses@subject@range@start + aliApses@subject@range@width - 1 | aliApses@subject@range@start + aliApses@subject@range@width - 1 | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Update your database script  ========================================= | # =    4  Update your database script  ========================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # Since we have this feature defined now, we can create a feature annotation | # Since we have this feature defined now, we can create a feature annotation | ||||||
| # right away and store it in myDB. | # right away and store it in myDB. | ||||||
|  |  | ||||||
| # ==   4.1  Preparing an annotation file ...  ================================== | # ==   4.1  Preparing an annotation file ...  ================================== | ||||||
| # | # | ||||||
| # ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit | # ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit | ||||||
| # | # | ||||||
| # | # | ||||||
| #   You DON'T already have a file called "<MYSPE>-Annotations.json" in the | #   You DON'T already have a file called "<MYSPE>-Annotations.json" in the | ||||||
| #   ./myScripts/ directory: | #   ./myScripts/ directory: | ||||||
| # | # | ||||||
| #   - Make a copy of the file "./data/refAnnotations.json" and put it in your | #   - Make a copy of the file "./data/refAnnotations.json" and put it in your | ||||||
| #     myScripts/ directory. | #     myScripts/ directory. | ||||||
| # | # | ||||||
| #   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. | #   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. | ||||||
| #     if MYSPE is called "Cryptococcus neoformans", your file should be called | #     if MYSPE is called "Cryptococcus neoformans", your file should be called | ||||||
| #     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is | #     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is | ||||||
| #     "MBP1_CRYNE"). | #     "MBP1_CRYNE"). | ||||||
| # | # | ||||||
| #   - Open the file in the RStudio editor and delete all blocks for | #   - Open the file in the RStudio editor and delete all blocks for | ||||||
| #     the Mbp1 protein annotations except the first one. | #     the Mbp1 protein annotations except the first one. | ||||||
| # | # | ||||||
| #   - From that block, delete all lines except for the line that says: | #   - From that block, delete all lines except for the line that says: | ||||||
| # | # | ||||||
| # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, | # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, | ||||||
| # | # | ||||||
| #   - Then delete the comma at the end of the line (your file will just have | #   - Then delete the comma at the end of the line (your file will just have | ||||||
| #     this one annotation). | #     this one annotation). | ||||||
| # | # | ||||||
| #   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the | #   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the | ||||||
| #     "start" and "end" features to the coordinates you just discovered for the | #     "start" and "end" features to the coordinates you just discovered for the | ||||||
| #     APSES domain in your sequence. | #     APSES domain in your sequence. | ||||||
| # | # | ||||||
| #   - Save the file in your myScripts/ directory | #   - Save the file in your myScripts/ directory | ||||||
| # | # | ||||||
| #   - Validate your file online at https://jsonlint.com/ | #   - Validate your file online at https://jsonlint.com/ | ||||||
| # | # | ||||||
| #   - Update your "./myScripts/makeProteinDB.R" script to load your new | #   - Update your "./myScripts/makeProteinDB.R" script to load your new | ||||||
| #     annotation when you recreate the database. Open the script in the | #     annotation when you recreate the database. Open the script in the | ||||||
| #     RStudio editor, and add the following command at the end: | #     RStudio editor, and add the following command at the end: | ||||||
| # | # | ||||||
| #     myDB <- dbAddAnnotation(myDB, | #     myDB <- dbAddAnnotation(myDB, | ||||||
| #                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) | #                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) | ||||||
| #                                                 ^^^^^^^ | #                                                 ^^^^^^^ | ||||||
| #                                                edit this! | #                                                edit this! | ||||||
| #   - save and close the file. | #   - save and close the file. | ||||||
| # | # | ||||||
| # Then SKIP the next section. | # Then SKIP the next section. | ||||||
| # | # | ||||||
| # | # | ||||||
| # ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit     | # ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit     | ||||||
| # | # | ||||||
| # | # | ||||||
| #   You DO already have a file called "<MYSPE>-Annotations.json" in the | #   You DO already have a file called "<MYSPE>-Annotations.json" in the | ||||||
| #   ./myScripts/ directory: | #   ./myScripts/ directory: | ||||||
| # | # | ||||||
| #   - Open the file in the RStudio editor. | #   - Open the file in the RStudio editor. | ||||||
| # | # | ||||||
| #   - Below the last feature lines (but before the closing "]") add the | #   - Below the last feature lines (but before the closing "]") add the | ||||||
| #     following feature line (without the "#") | #     following feature line (without the "#") | ||||||
| # | # | ||||||
| # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"} | # {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"} | ||||||
| # | # | ||||||
| #   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the | #   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the | ||||||
| #     "start" and "end" features to the coordinates you just discovered for the | #     "start" and "end" features to the coordinates you just discovered for the | ||||||
| #     APSES domain in your sequence. | #     APSES domain in your sequence. | ||||||
| # | # | ||||||
| #   - Add a comma after the preceding feature line. | #   - Add a comma after the preceding feature line. | ||||||
| # | # | ||||||
| #   - Save your file. | #   - Save your file. | ||||||
| # | # | ||||||
| #   - Validate your file online at https://jsonlint.com/ | #   - Validate your file online at https://jsonlint.com/ | ||||||
| # | # | ||||||
| # | # | ||||||
| # ==   4.2  Execute and Validate  ============================================== | # ==   4.2  Execute and Validate  ============================================== | ||||||
| # | # | ||||||
| #   - source() your database creation script: | #   - source() your database creation script: | ||||||
| # | # | ||||||
| #  source("./myScripts/makeProteinDB.R") | #  source("./myScripts/makeProteinDB.R") | ||||||
| # | # | ||||||
| #     This should run without errors or warnings. If it doesn't work and you | #     This should run without errors or warnings. If it doesn't work and you | ||||||
| #     can't figure out quickly what's happening, ask on the mailing list for | #     can't figure out quickly what's happening, ask on the mailing list for | ||||||
| #     help. | #     help. | ||||||
| # | # | ||||||
| #   - Confirm | #   - Confirm | ||||||
| #     The following commands should retrieve the correct start and end | #     The following commands should retrieve the correct start and end | ||||||
| #     coordinates and sequence of the MBP1_MYSPE APSES domain: | #     coordinates and sequence of the MBP1_MYSPE APSES domain: | ||||||
|  |  | ||||||
| sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")) | sel <- which(myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")) | ||||||
|  |  | ||||||
| (proID <- myDB$protein$ID[sel]) | (proID <- myDB$protein$ID[sel]) | ||||||
| (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | (ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||||
| (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | (fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID & | ||||||
|                              myDB$annotation$featureID == ftrID]) |                              myDB$annotation$featureID == ftrID]) | ||||||
| (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | (start <- myDB$annotation$start[myDB$annotation$ID == fanID]) | ||||||
| (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | (end   <- myDB$annotation$end[myDB$annotation$ID == fanID]) | ||||||
| (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | (apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID], | ||||||
|                  start, |                  start, | ||||||
|                  end)) |                  end)) | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,313 +1,313 @@ | |||||||
| # tocID <- "BIN-ALI-Similarity.R" | # tocID <- "BIN-ALI-Similarity.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-Similarity unit. | #              R code accompanying the BIN-ALI-Similarity unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Updates | #           1.2    2020 Updates | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    Refactored for 2017; add aaindex, ternary plot. | #           1.0    Refactored for 2017; add aaindex, ternary plot. | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| #   Update ggtern:: ternary plot to use aacol dots under text | #   Update ggtern:: ternary plot to use aacol dots under text | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                          Line | #TOC>   Section  Title                          Line | ||||||
| #TOC> ---------------------------------------------- | #TOC> ---------------------------------------------- | ||||||
| #TOC>   1        Amino Acid Properties            43 | #TOC>   1        Amino Acid Properties            43 | ||||||
| #TOC>   2        Mutation Data matrix            189 | #TOC>   2        Mutation Data matrix            189 | ||||||
| #TOC>   3        Background score                230 | #TOC>   3        Background score                230 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Amino Acid Properties  =============================================== | # =    1  Amino Acid Properties  =============================================== | ||||||
|  |  | ||||||
| # A large collection of amino acid property tables is available via the seqinr | # A large collection of amino acid property tables is available via the seqinr | ||||||
| # package: | # package: | ||||||
|  |  | ||||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||||
|   install.packages("seqinr") |   install.packages("seqinr") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = seqinr)       # basic information | #  library(help = seqinr)       # basic information | ||||||
| #  browseVignettes("seqinr")    # available vignettes | #  browseVignettes("seqinr")    # available vignettes | ||||||
| #  data(package = "seqinr")     # available datasets | #  data(package = "seqinr")     # available datasets | ||||||
|  |  | ||||||
| # A true Labor of Love has gone into the compilation of the seqinr "aaindex" | # A true Labor of Love has gone into the compilation of the seqinr "aaindex" | ||||||
| #  data: | #  data: | ||||||
|  |  | ||||||
| ?aaindex | ?aaindex | ||||||
| data(aaindex, package = "seqinr")  # load the aaindex list from the package | data(aaindex, package = "seqinr")  # load the aaindex list from the package | ||||||
|  |  | ||||||
| length(aaindex) | length(aaindex) | ||||||
|  |  | ||||||
| # Here are all the index descriptions | # Here are all the index descriptions | ||||||
| # seq_along() yields no iterations for an empty list, unlike 1:length() | # seq_along() yields no iterations for an empty list, unlike 1:length() | ||||||
| for (i in seq_along(aaindex)) { | for (i in seq_along(aaindex)) { | ||||||
|   # print one line per entry: "<index>: <description>" |   # print one line per entry: "<index>: <description>" | ||||||
|   cat(paste0(i, ": ", aaindex[[i]]$D, "\n")) |   cat(paste0(i, ": ", aaindex[[i]]$D, "\n")) | ||||||
| } | } | ||||||
|  |  | ||||||
| # It's a bit cumbersome to search through the descriptions ... here is a | # It's a bit cumbersome to search through the descriptions ... here is a | ||||||
| # function to make this easier: | # function to make this easier: | ||||||
|  |  | ||||||
| searchAAindex <- function(patt) { | searchAAindex <- function(patt) { | ||||||
|   # Searches the aaindex descriptions ($D) for the regular expression "patt" |   # Searches the aaindex descriptions ($D) for the regular expression "patt" | ||||||
|   # and prints index number and description of every matching entry. |   # and prints index number and description of every matching entry. | ||||||
|   # Parameters: |   # Parameters: | ||||||
|   #    patt   chr   regular expression, matched case-sensitively |   #    patt   chr   regular expression, matched case-sensitively | ||||||
|   # Value:    NULL (invisible). The function is called for its printed output. |   # Value:    NULL (invisible). The function is called for its printed output. | ||||||
|   # Note:     uses the list "aaindex" from the workspace, it is not passed in. |   # Note:     uses the list "aaindex" from the workspace, it is not passed in. | ||||||
|  |  | ||||||
|   # vapply() always returns a logical vector, even if aaindex is empty |   # vapply() always returns a logical vector, even if aaindex is empty | ||||||
|   isHit <- vapply(aaindex, |   isHit <- vapply(aaindex, | ||||||
|                   function(x) { any(grepl(patt, x$D)) }, |                   function(x) { any(grepl(patt, x$D)) }, | ||||||
|                   logical(1)) |                   logical(1)) | ||||||
|   for (idx in which(isHit)) { |   for (idx in which(isHit)) { | ||||||
|     cat(sprintf("%3d\t%s\n", idx, aaindex[[idx]]$D)) |     cat(sprintf("%3d\t%s\n", idx, aaindex[[idx]]$D)) | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| searchAAindex("free energy")          # Search for "free energy" | searchAAindex("free energy")          # Search for "free energy" | ||||||
| searchAAindex("(size)|(volume)")      # Search for "size" or "volume": | searchAAindex("(size)|(volume)")      # Search for "size" or "volume": | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Let's examine ... | # Let's examine ... | ||||||
| # ... a hydrophobicity index | # ... a hydrophobicity index | ||||||
| (Y <- aaindex[[528]][c("D", "I")]) | (Y <- aaindex[[528]][c("D", "I")]) | ||||||
|  |  | ||||||
| # ... a volume index | # ... a volume index | ||||||
| (V <- aaindex[[150]][c("D", "I")]) | (V <- aaindex[[150]][c("D", "I")]) | ||||||
|  |  | ||||||
| # ... and one of our own: side-chain pK values as reported by | # ... and one of our own: side-chain pK values as reported by | ||||||
| # Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set | # Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set | ||||||
| # to 7.4 (physiological pH) | # to 7.4 (physiological pH) | ||||||
| K <- list(I = c( 7.4,   # Ala | K <- list(I = c( 7.4,   # Ala | ||||||
|                 12.3,   # Arg |                 12.3,   # Arg | ||||||
|                  7.4,   # Asn |                  7.4,   # Asn | ||||||
|                  3.9,   # Asp |                  3.9,   # Asp | ||||||
|                  8.6,   # Cys |                  8.6,   # Cys | ||||||
|                  7.4,   # Gln |                  7.4,   # Gln | ||||||
|                  4.3,   # Glu |                  4.3,   # Glu | ||||||
|                  7.4,   # Gly |                  7.4,   # Gly | ||||||
|                  6.5,   # His |                  6.5,   # His | ||||||
|                  7.4,   # Ile |                  7.4,   # Ile | ||||||
|                  7.4,   # Leu |                  7.4,   # Leu | ||||||
|                 10.4,   # Lys |                 10.4,   # Lys | ||||||
|                  7.4,   # Met |                  7.4,   # Met | ||||||
|                  7.4,   # Phe |                  7.4,   # Phe | ||||||
|                  7.4,   # Pro |                  7.4,   # Pro | ||||||
|                  7.4,   # Ser |                  7.4,   # Ser | ||||||
|                  7.4,   # Thr |                  7.4,   # Thr | ||||||
|                  7.4,   # Trp |                  7.4,   # Trp | ||||||
|                  9.8,   # Tyr |                  9.8,   # Tyr | ||||||
|                  7.4))  # Val |                  7.4))  # Val | ||||||
| names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile", | names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile", | ||||||
|                 "Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val") |                 "Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val") | ||||||
|  |  | ||||||
|  |  | ||||||
| # Given these biophysical indices, how similar are the amino acids? We have | # Given these biophysical indices, how similar are the amino acids? We have | ||||||
| # three dimensions of measures here. Scatterplots can only display two | # three dimensions of measures here. Scatterplots can only display two | ||||||
| # dimensions ... | # dimensions ... | ||||||
|  |  | ||||||
| # pull the names from Y$I, convert them to single letter code, and reorder the | # pull the names from Y$I, convert them to single letter code, and reorder the | ||||||
| # AACOLS palette accordingly ... | # AACOLS palette accordingly ... | ||||||
| aac <- AACOLS[toupper(seqinr::a(names(Y$I)))] | aac <- AACOLS[toupper(seqinr::a(names(Y$I)))] | ||||||
|  |  | ||||||
| plot(Y$I, V$I, | plot(Y$I, V$I, | ||||||
|      xlab = "hydrophobicity", ylab = "volume", |      xlab = "hydrophobicity", ylab = "volume", | ||||||
|      pch = 21, |      pch = 21, | ||||||
|      cex = 6, |      cex = 6, | ||||||
|      col = aac, |      col = aac, | ||||||
|      bg  = aac) |      bg  = aac) | ||||||
| text(Y$I, V$I, names(Y$I), cex = 0.8) | text(Y$I, V$I, names(Y$I), cex = 0.8) | ||||||
|  |  | ||||||
| plot(Y$I, K$I, | plot(Y$I, K$I, | ||||||
|      xlab = "hydrophobicity", ylab = "pK", |      xlab = "hydrophobicity", ylab = "pK", | ||||||
|      pch = 21, |      pch = 21, | ||||||
|      cex = 6, |      cex = 6, | ||||||
|      col = aac, |      col = aac, | ||||||
|      bg  = aac) |      bg  = aac) | ||||||
| text(Y$I, K$I, names(Y$I), cex = 0.8) | text(Y$I, K$I, names(Y$I), cex = 0.8) | ||||||
|  |  | ||||||
| # ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such | # ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such | ||||||
| # plots are in general unintuitive and hard to interpret. One alternative is a | # plots are in general unintuitive and hard to interpret. One alternative is a | ||||||
| # so-called "ternary plot": | # so-called "ternary plot": | ||||||
|  |  | ||||||
| if (! requireNamespace("ggtern", quietly=TRUE)) { | if (! requireNamespace("ggtern", quietly=TRUE)) { | ||||||
|   install.packages("ggtern") |   install.packages("ggtern") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = ggtern)       # basic information | #  library(help = ggtern)       # basic information | ||||||
| #  browseVignettes("ggtern")    # available vignettes | #  browseVignettes("ggtern")    # available vignettes | ||||||
| #  data(package = "ggtern")     # available datasets | #  data(package = "ggtern")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # collect into data frame, normalize to (0.05, 0.95) | # collect into data frame, normalize to (0.05, 0.95) | ||||||
| myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05, | myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05, | ||||||
|                     "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05, |                     "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05, | ||||||
|                     "pK"  = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05, |                     "pK"  = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05, | ||||||
|                     stringsAsFactors = FALSE) |                     stringsAsFactors = FALSE) | ||||||
| rownames(myDat) <- names(Y$I) | rownames(myDat) <- names(Y$I) | ||||||
|  |  | ||||||
| ggtern::ggtern(data = myDat, | ggtern::ggtern(data = myDat, | ||||||
|                ggplot2::aes(x = vol, |                ggplot2::aes(x = vol, | ||||||
|                    y = phi, |                    y = phi, | ||||||
|                    z = pK, |                    z = pK, | ||||||
|                    label = rownames(myDat))) + ggplot2::geom_text() |                    label = rownames(myDat))) + ggplot2::geom_text() | ||||||
|  |  | ||||||
| # This results in a mapping of amino acids relative to each other that is | # This results in a mapping of amino acids relative to each other that is | ||||||
| # similar to the Venn diagram you have seen in the notes. | # similar to the Venn diagram you have seen in the notes. | ||||||
|  |  | ||||||
| # ... or we could use principal components analysis, to pull out the | # ... or we could use principal components analysis, to pull out the | ||||||
| # best projection of the three feature dimensions into two. (Done here without delving | # best projection of the three feature dimensions into two. (Done here without delving | ||||||
| # into the theory ...) | # into the theory ...) | ||||||
| prc <- prcomp(myDat) | prc <- prcomp(myDat) | ||||||
| # Plot the first two principal components. Points are coloured with "aac", | # Plot the first two principal components. Points are coloured with "aac", | ||||||
| # the AACOLS palette that was reordered above to match names(Y$I). | # the AACOLS palette that was reordered above to match names(Y$I). | ||||||
| # (This used to say "aad", which is not defined anywhere in this script.) | # (This used to say "aad", which is not defined anywhere in this script.) | ||||||
| plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n", | plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n", | ||||||
|      pch=19, cex=6, col=aac, cex.main=0.7, |      pch=19, cex=6, col=aac, cex.main=0.7, | ||||||
|      main="Principal Component Analysis of Amino Acid Features") |      main="Principal Component Analysis of Amino Acid Features") | ||||||
| text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088") | text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088") | ||||||
|  |  | ||||||
| # This matches the intuition rather well in that "similar" amino acids are close | # This matches the intuition rather well in that "similar" amino acids are close | ||||||
| # on the plot. But we can't interpret the distances in terms of just one of the | # on the plot. But we can't interpret the distances in terms of just one of the | ||||||
| # parameters. Whatever - nature has a different way to define similarity: | # parameters. Whatever - nature has a different way to define similarity: | ||||||
| # mutations to similar amino acids are less likely to break the protein. | # mutations to similar amino acids are less likely to break the protein. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Mutation Data matrix  ================================================ | # =    2  Mutation Data matrix  ================================================ | ||||||
|  |  | ||||||
| # A mutation data matrix encodes all amino acid pairscores in a matrix. | # A mutation data matrix encodes all amino acid pairscores in a matrix. | ||||||
|  |  | ||||||
| # The Biostrings package contains the most common mutation data matrices. | # The Biostrings package contains the most common mutation data matrices. | ||||||
|  |  | ||||||
| if (! requireNamespace("BiocManager", quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly=TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
| if (! requireNamespace("Biostrings", quietly=TRUE)) { | if (! requireNamespace("Biostrings", quietly=TRUE)) { | ||||||
|   BiocManager::install("Biostrings") |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help=Biostrings)       # basic information | #  library(help=Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")  # available vignettes | #  browseVignettes("Biostrings")  # available vignettes | ||||||
| #  data(package = "Biostrings")   # available datasets | #  data(package = "Biostrings")   # available datasets | ||||||
|  |  | ||||||
| # Let's attach the BLOSUM62 mutation data matrix from the package | # Let's attach the BLOSUM62 mutation data matrix from the package | ||||||
| data(BLOSUM62, package = "Biostrings") | data(BLOSUM62, package = "Biostrings") | ||||||
|  |  | ||||||
| # ... and see what it contains. (You've seen this matrix before.) | # ... and see what it contains. (You've seen this matrix before.) | ||||||
| BLOSUM62 | BLOSUM62 | ||||||
|  |  | ||||||
| # We can simply access values via the row/column names. | # We can simply access values via the row/column names. | ||||||
| # Identical amino acids have high scores ... | # Identical amino acids have high scores ... | ||||||
| BLOSUM62["H", "H"]   # Score for a pair of two histidines | BLOSUM62["H", "H"]   # Score for a pair of two histidines | ||||||
| BLOSUM62["S", "S"]   # Score for a pair of two serines | BLOSUM62["S", "S"]   # Score for a pair of two serines | ||||||
|  |  | ||||||
| # Similar amino acids have low positive scores ... | # Similar amino acids have low positive scores ... | ||||||
| BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair | BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair | ||||||
| BLOSUM62["F", "Y"]   # etc. | BLOSUM62["F", "Y"]   # etc. | ||||||
|  |  | ||||||
| # Dissimilar amino acids have negative scores ... | # Dissimilar amino acids have negative scores ... | ||||||
| BLOSUM62["L", "K"]   # Score for a leucine / lysine pair | BLOSUM62["L", "K"]   # Score for a leucine / lysine pair | ||||||
| BLOSUM62["Q", "P"]   # etc. | BLOSUM62["Q", "P"]   # etc. | ||||||
|  |  | ||||||
|  |  | ||||||
| BLOSUM62["R", "W"]   # the matrix is symmetric! | BLOSUM62["R", "W"]   # the matrix is symmetric! | ||||||
| BLOSUM62["W", "R"] | BLOSUM62["W", "R"] | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Background score  ==================================================== | # =    3  Background score  ==================================================== | ||||||
|  |  | ||||||
| # The mutation data matrix is designed to give high scores to homologous | # The mutation data matrix is designed to give high scores to homologous | ||||||
| # sequences, low scores to non-homologous sequences. What score on average | # sequences, low scores to non-homologous sequences. What score on average | ||||||
| # should we expect for a random sequence? | # should we expect for a random sequence? | ||||||
|  |  | ||||||
| # If we sample amino acid pairs at random, we will get a score that is the | # If we sample amino acid pairs at random, we will get a score that is the | ||||||
| # average of the individual pairscores in the matrix. Omitting the ambiguity | # average of the individual pairscores in the matrix. Omitting the ambiguity | ||||||
| # codes and the gap character: | # codes and the gap character: | ||||||
|  |  | ||||||
| sum(BLOSUM62[1:20, 1:20])/400 | sum(BLOSUM62[1:20, 1:20])/400 | ||||||
|  |  | ||||||
| # But that score could be higher for real sequences, for which the amino acid | # But that score could be higher for real sequences, for which the amino acid | ||||||
| # distribution is not random. For example membrane proteins have a large number | # distribution is not random. For example membrane proteins have a large number | ||||||
| # of hydrophobic residues - an alignment of unrelated proteins might produce | # of hydrophobic residues - an alignment of unrelated proteins might produce | ||||||
| # positive scores. And there are other proteins with biased amino acid | # positive scores. And there are other proteins with biased amino acid | ||||||
| # compositions, in particular proteins that interact with multiple other | # compositions, in particular proteins that interact with multiple other | ||||||
| # proteins. Let's test how this impacts the background score by comparing a | # proteins. Let's test how this impacts the background score by comparing a | ||||||
| # sequence with shuffled sequences. These have the same composition, but are | # sequence with shuffled sequences. These have the same composition, but are | ||||||
| # obviously not homologous. The data directory contains the FASTA file for the | # obviously not homologous. The data directory contains the FASTA file for the | ||||||
| # PDB ID 3FG7 - a villin headpiece structure with a large amount of | # PDB ID 3FG7 - a villin headpiece structure with a large amount of | ||||||
| # low-complexity amino acid sequence ... | # low-complexity amino acid sequence ... | ||||||
|  |  | ||||||
| aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]] | aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]] | ||||||
|  |  | ||||||
| # ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C) | # ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C) | ||||||
| # with an exceptionally high percentage of hydrophobic residues. | # with an exceptionally high percentage of hydrophobic residues. | ||||||
|  |  | ||||||
| aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]] | aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]] | ||||||
|  |  | ||||||
| # Here is a function that takes two sequences and | # Here is a function that takes two sequences and | ||||||
| # returns their average pairscore. | # returns their average pairscore. | ||||||
|  |  | ||||||
| averagePairScore <- function(a, b, MDM = BLOSUM62) { | averagePairScore <- function(a, b, MDM = BLOSUM62) { | ||||||
|   # Returns average pairscore of two sequences. |   # Returns average pairscore of two sequences. | ||||||
|   # Parameters: |   # Parameters: | ||||||
|   #    a, b   chr   amino acid sequence strings of equal length |   #    a, b   chr   amino acid sequence strings of equal length | ||||||
|   #    MDM          mutation data matrix; its row and column names must be |   #    MDM          mutation data matrix; its row and column names must be | ||||||
|   #                 the one-letter codes used in a and b. Default is BLOSUM62 |   #                 the one-letter codes used in a and b. Default is BLOSUM62 | ||||||
|   # Value:    num   average pairscore (NaN if the sequences are empty). |   # Value:    num   average pairscore (NaN if the sequences are empty). | ||||||
|   a <- unlist(strsplit(a, "")) |   a <- unlist(strsplit(a, "")) | ||||||
|   b <- unlist(strsplit(b, "")) |   b <- unlist(strsplit(b, "")) | ||||||
|   # Residues are scored position by position: refuse sequences of unequal |   # Residues are scored position by position: refuse sequences of unequal | ||||||
|   # length, rather than silently ignoring the extra residues of b. |   # length, rather than silently ignoring the extra residues of b. | ||||||
|   if (length(a) != length(b)) { |   if (length(a) != length(b)) { | ||||||
|     stop("Sequences a and b must have the same length.", call. = FALSE) |     stop("Sequences a and b must have the same length.", call. = FALSE) | ||||||
|   } |   } | ||||||
|   # A two-column character matrix of (row name, column name) pairs looks up |   # A two-column character matrix of (row name, column name) pairs looks up | ||||||
|   # all pairscores in one step. |   # all pairscores in one step. | ||||||
|   v <- sum(MDM[cbind(a, b)]) |   v <- sum(MDM[cbind(a, b)]) | ||||||
|   return(v / length(a)) |   return(v / length(a)) | ||||||
| } | } | ||||||
|  |  | ||||||
| # Score each original sequence against N random shuffles of itself. | # Score each original sequence against N random shuffles of itself. | ||||||
| orig3FG7 <- toString(aa3FG7) | orig3FG7 <- toString(aa3FG7) | ||||||
| orig2F1C <- toString(aa2F1C) | orig2F1C <- toString(aa2F1C) | ||||||
| N <- 1000 | N <- 1000 | ||||||
| scores3FG7 <- numeric(N)   # preallocate the result vectors | scores3FG7 <- numeric(N)   # preallocate the result vectors | ||||||
| scores2F1C <- numeric(N) | scores2F1C <- numeric(N) | ||||||
| for (i in seq_len(N)) {    # seq_len(): no iterations if N is 0 | for (i in seq_len(N)) {    # seq_len(): no iterations if N is 0 | ||||||
|   scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7))) |   scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7))) | ||||||
|   scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C))) |   scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C))) | ||||||
| } | } | ||||||
|  |  | ||||||
| # Plot the distributions | # Plot the distributions | ||||||
| hist(scores3FG7, | hist(scores3FG7, | ||||||
|      col="#5599EE33", |      col="#5599EE33", | ||||||
|      breaks = seq(-1.5, 0, by=0.1), |      breaks = seq(-1.5, 0, by=0.1), | ||||||
|      main = "Pairscores for randomly shuffled sequences", |      main = "Pairscores for randomly shuffled sequences", | ||||||
|      xlab = "Average pairscore from BLOSUM 62") |      xlab = "Average pairscore from BLOSUM 62") | ||||||
| hist(scores2F1C, | hist(scores2F1C, | ||||||
|      col="#55EE9933", |      col="#55EE9933", | ||||||
|      breaks = seq(-1.5, 0, by=0.1), |      breaks = seq(-1.5, 0, by=0.1), | ||||||
|      add = TRUE) |      add = TRUE) | ||||||
| abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2) | abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2) | ||||||
| legend('topright', | legend('topright', | ||||||
|        c("3FG7 (villin)", "2F1C (OmpG)"), |        c("3FG7 (villin)", "2F1C (OmpG)"), | ||||||
|        fill = c("#5599EE33", "#55EE9933"), bty = 'n', |        fill = c("#5599EE33", "#55EE9933"), bty = 'n', | ||||||
|        inset = 0.1) |        inset = 0.1) | ||||||
|  |  | ||||||
| # This is an important result: even though we have shuffled significantly biased | # This is an important result: even though we have shuffled significantly biased | ||||||
| # sequences, and the average scores trend above the average of the mutation data | # sequences, and the average scores trend above the average of the mutation data | ||||||
| # matrix, the average scores still remain comfortably below zero. This means | # matrix, the average scores still remain comfortably below zero. This means | ||||||
| # that we can't (in general) improve a high-scoring alignment by simply | # that we can't (in general) improve a high-scoring alignment by simply | ||||||
| # extending it with randomly matched residues. We will only improve the score if | # extending it with randomly matched residues. We will only improve the score if | ||||||
| # the similarity of newly added residues is larger than what we expect to get by | # the similarity of newly added residues is larger than what we expect to get by | ||||||
| # random chance! | # random chance! | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,216 +1,216 @@ | |||||||
| # tocID <- "BIN-Data_integration.R" | # tocID <- "BIN-Data_integration.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-Data_integration unit. | #              R code accompanying the BIN-Data_integration unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2018-10  -  2020-09 | # Date:     2018-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Maintenance and updates | #           1.2    2020 Maintenance and updates | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0.1  Bugfix: UniProt ID Mapping service API change | #           1.0.1  Bugfix: UniProt ID Mapping service API change | ||||||
| #           1.0    First live version | #           1.0    First live version | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| #           Develop a fungi-specific BioMart example. | #           Develop a fungi-specific BioMart example. | ||||||
| #           (cf. | #           (cf. | ||||||
| # https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html ) | # https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html ) | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                             Line | #TOC>   Section  Title                             Line | ||||||
| #TOC> ------------------------------------------------- | #TOC> ------------------------------------------------- | ||||||
| #TOC>   1        Identifier mapping                  42 | #TOC>   1        Identifier mapping                  42 | ||||||
| #TOC>   2        Cross-referencing tables           165 | #TOC>   2        Cross-referencing tables           165 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Identifier mapping  ================================================== | # =    1  Identifier mapping  ================================================== | ||||||
|  |  | ||||||
| # UniProt provides a well-designed ID mapping tool that can be accessed | # UniProt provides a well-designed ID mapping tool that can be accessed | ||||||
| # online at     http://www.uniprot.org/mapping/ | # online at     http://www.uniprot.org/mapping/ | ||||||
| # | # | ||||||
| # Here we will use the UniProt Web API for this tool to map identifiers. The | # Here we will use the UniProt Web API for this tool to map identifiers. The | ||||||
| # UniProt ID mapping service supports a "RESTful API": responses can be obtained | # UniProt ID mapping service supports a "RESTful API": responses can be obtained | ||||||
| # simply via a Web browser's request. Such requests are commonly sent via the | # simply via a Web browser's request. Such requests are commonly sent via the | ||||||
| # GET or POST verbs that a Webserver responds to, when a client asks for data. | # GET or POST verbs that a Webserver responds to, when a client asks for data. | ||||||
| # GET requests are visible in the URL of the request; POST requests are not | # GET requests are visible in the URL of the request; POST requests are not | ||||||
| # directly visible, they are commonly used to send the contents of forms, or | # directly visible, they are commonly used to send the contents of forms, or | ||||||
| # when transmitting larger, complex data items. The UniProt ID mapping service | # when transmitting larger, complex data items. The UniProt ID mapping service | ||||||
| # can accept long lists of IDs, thus using the POST mechanism makes sense. GET() | # can accept long lists of IDs, thus using the POST mechanism makes sense. GET() | ||||||
| # and  POST() functions are part of the httr package. | # and  POST() functions are part of the httr package. | ||||||
|  |  | ||||||
| # To begin, we load  httr, which supports sending and receiving data via the | # To begin, we load  httr, which supports sending and receiving data via the | ||||||
| # http protocol, just like a Web browser. | # http protocol, just like a Web browser. | ||||||
| if (! requireNamespace("httr", quietly=TRUE)) { | if (! requireNamespace("httr", quietly=TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = httr)       # basic information | #  library(help = httr)       # basic information | ||||||
| #  browseVignettes("httr")    # available vignettes | #  browseVignettes("httr")    # available vignettes | ||||||
| #  data(package = "httr")     # available datasets | #  data(package = "httr")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # We will walk through the process with the refSeqID | # We will walk through the process with the refSeqID | ||||||
| # of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what | # of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what | ||||||
| # happens if the ID can't be mapped: | # happens if the ID can't be mapped: | ||||||
| myQueryIDs <- "NP_010227 NP_00000 NP_011036" | myQueryIDs <- "NP_010227 NP_00000 NP_011036" | ||||||
|  |  | ||||||
|  |  | ||||||
| # The UniProt ID mapping service API is very straightforward to use: just define | # The UniProt ID mapping service API is very straightforward to use: just define | ||||||
| # the URL of the server and send a list of items labelled as "query" in the body | # the URL of the server and send a list of items labelled as "query" in the body | ||||||
| # of the request. GET() and POST() are functions from httr. | # of the request. GET() and POST() are functions from httr. | ||||||
|  |  | ||||||
| # Note. A recent bug in the interaction between the server expectations and the | # Note. A recent bug in the interaction between the server expectations and the | ||||||
| # curl client libraries requires the following initialization | # curl client libraries requires the following initialization | ||||||
| httr::set_config(httr::config(http_version = 0)) | httr::set_config(httr::config(http_version = 0)) | ||||||
| # cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b | # cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b | ||||||
|  |  | ||||||
|  |  | ||||||
| URL <- "https://www.uniprot.org/mapping/" | URL <- "https://www.uniprot.org/mapping/" | ||||||
| response <- httr::POST(URL, | response <- httr::POST(URL, | ||||||
|                        body = list(from = "P_REFSEQ_AC",   # Refseq Protein |                        body = list(from = "P_REFSEQ_AC",   # Refseq Protein | ||||||
|                                    to = "ACC",             # UniProt ID |                                    to = "ACC",             # UniProt ID | ||||||
|                                    format = "tab", |                                    format = "tab", | ||||||
|                                    query = myQueryIDs)) |                                    query = myQueryIDs)) | ||||||
|  |  | ||||||
| cat(httr::content(response)) | cat(httr::content(response)) | ||||||
|  |  | ||||||
| # We need to check the status code - if it is not 200, an error occurred and we | # We need to check the status code - if it is not 200, an error occurred and we | ||||||
| # can't process the result: | # can't process the result: | ||||||
| httr::status_code(response) | httr::status_code(response) | ||||||
|  |  | ||||||
| # If the query is successful, tabbed text is returned. We can assign that to a | # If the query is successful, tabbed text is returned. We can assign that to a | ||||||
| # data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument. | # data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument. | ||||||
|  |  | ||||||
| myMappedIDs <- read.delim(file = textConnection(httr::content(response)), | myMappedIDs <- read.delim(file = textConnection(httr::content(response)), | ||||||
|                           sep = "\t", |                           sep = "\t", | ||||||
|                           stringsAsFactors = FALSE) |                           stringsAsFactors = FALSE) | ||||||
| myMappedIDs | myMappedIDs | ||||||
|  |  | ||||||
| # If this works as expected, you should see: | # If this works as expected, you should see: | ||||||
| #        From     To | #        From     To | ||||||
| # 1 NP_010227 P39678 | # 1 NP_010227 P39678 | ||||||
| # 2 NP_011036 P25302 | # 2 NP_011036 P25302 | ||||||
| # | # | ||||||
| # ... and note that there are only two entries, because nothing was returned | # ... and note that there are only two entries, because nothing was returned | ||||||
| # for the dummy "RefSeq ID" NP_00000 | # for the dummy "RefSeq ID" NP_00000 | ||||||
|  |  | ||||||
| # If the query can't be fulfilled because of a problem with the server, a | # If the query can't be fulfilled because of a problem with the server, a | ||||||
| # WebPage is returned. But the server status is also returned and we can check | # WebPage is returned. But the server status is also returned and we can check | ||||||
| # the status code. I have lately gotten many "503" status codes: Service | # the status code. I have lately gotten many "503" status codes: Service | ||||||
| # Unavailable... | # Unavailable... | ||||||
|  |  | ||||||
| # We wrap this into a function: | # We wrap this into a function: | ||||||
|  |  | ||||||
| myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { | myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { | ||||||
|   # Use UniProt ID mapping service to map one or more IDs |   # Use UniProt ID mapping service to map one or more IDs | ||||||
|   # Parameters: |   # Parameters: | ||||||
|   #    s  char  A string of separated IDs |   #    s  char  A string of separated IDs | ||||||
|   #    mapFrom  char  the database in which the IDs in s are valid. Default |   #    mapFrom  char  the database in which the IDs in s are valid. Default | ||||||
|   #                     is RefSeq protein |   #                     is RefSeq protein | ||||||
|   #    mapTo    char  the database in which the target IDs are valid. Default |   #    mapTo    char  the database in which the target IDs are valid. Default | ||||||
|   #                     is UniProtKB |   #                     is UniProtKB | ||||||
|   # Value |   # Value | ||||||
|   #    a data frame of mapped IDs, with column names From and To, or an |   #    a data frame of mapped IDs, with column names From and To, or an | ||||||
|   #    empty data frame if the mapping was unsuccessful. No rows are returned |   #    empty data frame if the mapping was unsuccessful. No rows are returned | ||||||
|   #    for IDs that are not mapped. |   #    for IDs that are not mapped. | ||||||
|  |  | ||||||
|   # Initialize curl |   # Initialize curl | ||||||
|   httr::set_config(httr::config(http_version = 0)) |   httr::set_config(httr::config(http_version = 0)) | ||||||
|  |  | ||||||
|   URL <- "https://www.uniprot.org/uploadlists/" |   URL <- "https://www.uniprot.org/uploadlists/" | ||||||
|   response <- httr::POST(URL, |   response <- httr::POST(URL, | ||||||
|                          body = list(from = mapFrom, |                          body = list(from = mapFrom, | ||||||
|                                      to = mapTo, |                                      to = mapTo, | ||||||
|                                      format = "tab", |                                      format = "tab", | ||||||
|                                      query = s)) |                                      query = s)) | ||||||
|  |  | ||||||
|   if (httr::status_code(response) == 200) { # 200: oK |   if (httr::status_code(response) == 200) { # 200: oK | ||||||
|     myMap <- read.delim(file = textConnection(httr::content(response)), |     myMap <- read.delim(file = textConnection(httr::content(response)), | ||||||
|                         sep = "\t", |                         sep = "\t", | ||||||
|                         stringsAsFactors = FALSE) |                         stringsAsFactors = FALSE) | ||||||
|     colnames(myMap) <- c("From", "To") |     colnames(myMap) <- c("From", "To") | ||||||
|   } else { |   } else { | ||||||
|     myMap <- data.frame() |     myMap <- data.frame() | ||||||
|     warning(paste("No uniProt ID mapping returned:", |     warning(paste("No uniProt ID mapping returned:", | ||||||
|                   "server sent status", |                   "server sent status", | ||||||
|                   httr::status_code(response))) |                   httr::status_code(response))) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   return(myMap) |   return(myMap) | ||||||
| } | } | ||||||
|  |  | ||||||
| # Try it out ... | # Try it out ... | ||||||
| myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165") | myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165") | ||||||
|  |  | ||||||
| # A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded | # A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded | ||||||
| # into your workspace on startup. | # into your workspace on startup. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Cross-referencing tables  ============================================ | # =    2  Cross-referencing tables  ============================================ | ||||||
|  |  | ||||||
| # Sometimes we get the IDs we need to map in a large table, e.g. from a list of | # Sometimes we get the IDs we need to map in a large table, e.g. from a list of | ||||||
| # genes in a model organism database such as SGD, or from the Human Gene | # genes in a model organism database such as SGD, or from the Human Gene | ||||||
| # Nomenclature commission. How do we map one set of identifiers to another one? | # Nomenclature commission. How do we map one set of identifiers to another one? | ||||||
|  |  | ||||||
| # The function to use is match(). | # The function to use is match(). | ||||||
| # Here is a tiny set of identifiers taken from a much larger table to | # Here is a tiny set of identifiers taken from a much larger table to | ||||||
| # illustrate the principle: | # illustrate the principle: | ||||||
| # | # | ||||||
|  |  | ||||||
| myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747", | myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747", | ||||||
|                               "Q08641", "P47129", "P52910", "P00330", "P81450"), |                               "Q08641", "P47129", "P52910", "P00330", "P81450"), | ||||||
|                     name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4", |                     name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4", | ||||||
|                               "AB140", "ACF4", "ACS2", "ADH1", "ATP18"), |                               "AB140", "ACF4", "ACS2", "ADH1", "ATP18"), | ||||||
|                     refID = c("NP_014657", "NP_009386", |                     refID = c("NP_014657", "NP_009386", | ||||||
|                               "NP_012683", "NP_012559", |                               "NP_012683", "NP_012559", | ||||||
|                               "NP_010038", "NP_014882", |                               "NP_010038", "NP_014882", | ||||||
|                               "NP_012616", "NP_013254", |                               "NP_012616", "NP_013254", | ||||||
|                               "NP_014555", "NP_013629")) |                               "NP_014555", "NP_013629")) | ||||||
|  |  | ||||||
| myIDs | myIDs | ||||||
|  |  | ||||||
| # Say we want to map "NP_010038", "NP_999999" (a dummy ID that is not in the | # Say we want to map "NP_010038", "NP_999999" (a dummy ID that is not in the | ||||||
| # table), and "NP_013629", in that order to their gene names. | # table), and "NP_013629", in that order to their gene names. | ||||||
| myQuery <- c("NP_010038", "NP_999999", "NP_013629") | myQuery <- c("NP_010038", "NP_999999", "NP_013629") | ||||||
|  |  | ||||||
| # %in% will only tell us if these IDs are present in the table: | # %in% will only tell us if these IDs are present in the table: | ||||||
| myQuery %in% myIDs$refID | myQuery %in% myIDs$refID | ||||||
|  |  | ||||||
| # ... but not where they are located. But match() does what we need here: | # ... but not where they are located. But match() does what we need here: | ||||||
| match(myQuery, myIDs$refID) | match(myQuery, myIDs$refID) | ||||||
|  |  | ||||||
| # ... and we can use the result to subset the column that we want to map to: | # ... and we can use the result to subset the column that we want to map to: | ||||||
| myIDs$name[match(myQuery, myIDs$refID)] | myIDs$name[match(myQuery, myIDs$refID)] | ||||||
|  |  | ||||||
| # Note that the output preserves the NA - i.e. the length of the mapped | # Note that the output preserves the NA - i.e. the length of the mapped | ||||||
| # values is exactly the same as the length of the query. | # values is exactly the same as the length of the query. | ||||||
|  |  | ||||||
| # task: map the three genes to their UniProt Identifier. | # task: map the three genes to their UniProt Identifier. | ||||||
|  |  | ||||||
|  |  | ||||||
| # | # | ||||||
| # Note: if you want to do very many queries in very large tables, use the | # Note: if you want to do very many queries in very large tables, use the | ||||||
| # fmatch() function in the "fastmatch" package for a considerable | # fmatch() function in the "fastmatch" package for a considerable | ||||||
| # speedup. | # speedup. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,435 +1,435 @@ | |||||||
| # tocID <- "BIN-FUNC-Domain_annotation.R" | # tocID <- "BIN-FUNC-Domain_annotation.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-FUNC-Domain_annotation unit. | #              R code accompanying the BIN-FUNC-Domain_annotation unit. | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # Version:  1.4 | # Version:  1.4 | ||||||
| # | # | ||||||
| # Date:     2017-11  -  2020-10 | # Date:     2017-11  -  2020-10 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.4    Add code for shared data import from the Wiki | #           1.4    Add code for shared data import from the Wiki | ||||||
| #           1.3    Add code for database export to JSON and instructions | #           1.3    Add code for database export to JSON and instructions | ||||||
| #                  for uploading annotations to the Public Student Wiki page | #                  for uploading annotations to the Public Student Wiki page | ||||||
| #           1.2    Consistently: data in ./myScripts/ ; | #           1.2    Consistently: data in ./myScripts/ ; | ||||||
| #                    begin SHARING DATA section | #                    begin SHARING DATA section | ||||||
| #           1.1    2020 Updates | #           1.1    2020 Updates | ||||||
| #           1.0    Live version 2017 | #           1.0    Live version 2017 | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| #           Put the domain plot into a function | #           Put the domain plot into a function | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                 Line | #TOC>   Section  Title                                                 Line | ||||||
| #TOC> --------------------------------------------------------------------- | #TOC> --------------------------------------------------------------------- | ||||||
| #TOC>   1        Update your database script                             51 | #TOC>   1        Update your database script                             51 | ||||||
| #TOC>   1.1        Preparing an annotation file ...                      58 | #TOC>   1.1        Preparing an annotation file ...                      58 | ||||||
| #TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61 | #TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61 | ||||||
| #TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109 | #TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109 | ||||||
| #TOC>   1.2        Execute and Validate                                 136 | #TOC>   1.2        Execute and Validate                                 136 | ||||||
| #TOC>   2        Plot Annotations                                       161 | #TOC>   2        Plot Annotations                                       161 | ||||||
| #TOC>   3        SHARING DATA                                           287 | #TOC>   3        SHARING DATA                                           287 | ||||||
| #TOC>   3.1        Post MBP1_MYSPE as JSON data                         303 | #TOC>   3.1        Post MBP1_MYSPE as JSON data                         303 | ||||||
| #TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326 | #TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Update your database script  ========================================= | # =    1  Update your database script  ========================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # Since you have recorded domain features at the SMART database, we can store | # Since you have recorded domain features at the SMART database, we can store | ||||||
| # the feature annotations in myDB ... | # the feature annotations in myDB ... | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.1  Preparing an annotation file ...  ================================== | # ==   1.1  Preparing an annotation file ...  ================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment" | # ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment" | ||||||
| # | # | ||||||
| #   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: | #   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: | ||||||
| # | # | ||||||
| #   You DON'T already have a file called "<MYSPE>-Annotations.json" in the | #   You DON'T already have a file called "<MYSPE>-Annotations.json" in the | ||||||
| #   ./myScripts/ directory: | #   ./myScripts/ directory: | ||||||
| # | # | ||||||
| #   - Make a copy of the file "./data/refAnnotations.json" and put it in your | #   - Make a copy of the file "./data/refAnnotations.json" and put it in your | ||||||
| #     myScripts/ directory. | #     myScripts/ directory. | ||||||
| # | # | ||||||
| #   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. | #   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g. | ||||||
| #     if MYSPE is called "Cryptococcus neoformans", your file should be called | #     if MYSPE is called "Cryptococcus neoformans", your file should be called | ||||||
| #     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is | #     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is | ||||||
| #     "MBP1_CRYNE"). | #     "MBP1_CRYNE"). | ||||||
| # | # | ||||||
| #   - Open the file in the RStudio editor and delete all blocks for | #   - Open the file in the RStudio editor and delete all blocks for | ||||||
| #     the Mbp1 protein annotations except the first one. | #     the Mbp1 protein annotations except the first one. | ||||||
| # | # | ||||||
| #   - From that block, delete all lines that have annotations you did not | #   - From that block, delete all lines that have annotations you did not | ||||||
| #     find in SMART for MBP1_MYSPE. | #     find in SMART for MBP1_MYSPE. | ||||||
| # | # | ||||||
| #   - Make enough copies of the "Ankyrin fold" and "low complexity" region | #   - Make enough copies of the "Ankyrin fold" and "low complexity" region | ||||||
| #     lines to have a line for each feature you found. | #     lines to have a line for each feature you found. | ||||||
| # | # | ||||||
| #   - Then delete the comma at the end of the last line. | #   - Then delete the comma at the end of the last line. | ||||||
| # | # | ||||||
| #   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere | #   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere | ||||||
| #     and change the "start" and "end" features to the coordinates you | #     and change the "start" and "end" features to the coordinates you | ||||||
| #     recorded in the SMART database. | #     recorded in the SMART database. | ||||||
| # | # | ||||||
| #   - Save your file in the ./myScripts/ folder. | #   - Save your file in the ./myScripts/ folder. | ||||||
| # | # | ||||||
| #   - Validate your file online at https://jsonlint.com/ | #   - Validate your file online at https://jsonlint.com/ | ||||||
| # | # | ||||||
| #   - Update your "./myScripts/makeProteinDB.R" script to load your new | #   - Update your "./myScripts/makeProteinDB.R" script to load your new | ||||||
| #     annotation when you recreate the database. Open the script in the | #     annotation when you recreate the database. Open the script in the | ||||||
| #     RStudio editor, and add the following command at the end: | #     RStudio editor, and add the following command at the end: | ||||||
| # | # | ||||||
| #     myDB <- dbAddAnnotation(myDB, | #     myDB <- dbAddAnnotation(myDB, | ||||||
| #         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) | #         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json")) | ||||||
| #                                         ^^^^^^^ | #                                         ^^^^^^^ | ||||||
| #                                        edit this! | #                                        edit this! | ||||||
| # | # | ||||||
| #   - save and close the file. | #   - save and close the file. | ||||||
| # | # | ||||||
| # Then SKIP the next section. | # Then SKIP the next section. | ||||||
| # | # | ||||||
| # | # | ||||||
| # ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"   | # ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"   | ||||||
| # | # | ||||||
| #   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: | #   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT: | ||||||
| # | # | ||||||
| #   You SHOULD have a file called "<MYSPE>-Annotations.json" in the | #   You SHOULD have a file called "<MYSPE>-Annotations.json" in the | ||||||
| #  ./myScripts/ directory: | #  ./myScripts/ directory: | ||||||
| # | # | ||||||
| #   - Open the file in the RStudio editor. | #   - Open the file in the RStudio editor. | ||||||
| # | # | ||||||
| #   - Make as many copies of the "APSES fold" line as you have found | #   - Make as many copies of the "APSES fold" line as you have found | ||||||
| #     features in SMART. | #     features in SMART. | ||||||
| # | # | ||||||
| #   - Add a comma after every line except for the last one | #   - Add a comma after every line except for the last one | ||||||
| # | # | ||||||
| #   - Edit the annotations but include only features that are in the | #   - Edit the annotations but include only features that are in the | ||||||
| #     myDB$feature table. Check which features are in the database by executing | #     myDB$feature table. Check which features are in the database by executing | ||||||
| # | # | ||||||
| #        myDB$feature$name | #        myDB$feature$name | ||||||
| # | # | ||||||
| #   - Update the "start" and "end" coordinates for each feature to the | #   - Update the "start" and "end" coordinates for each feature to the | ||||||
| #     values you found. | #     values you found. | ||||||
| # | # | ||||||
| #   - Save your file. | #   - Save your file. | ||||||
| # | # | ||||||
| #   - Validate your file online at https://jsonlint.com/ | #   - Validate your file online at https://jsonlint.com/ | ||||||
| # | # | ||||||
| # | # | ||||||
| # ==   1.2  Execute and Validate  ============================================== | # ==   1.2  Execute and Validate  ============================================== | ||||||
| # | # | ||||||
| #   - source() your database creation script: | #   - source() your database creation script: | ||||||
| # | # | ||||||
| #     source("./myScripts/makeProteinDB.R") | #     source("./myScripts/makeProteinDB.R") | ||||||
| # | # | ||||||
| #     This should run without errors or warnings. If it doesn't work and you | #     This should run without errors or warnings. If it doesn't work and you | ||||||
| #     can't figure out quickly what's happening, ask for help on the | #     can't figure out quickly what's happening, ask for help on the | ||||||
| #     Discussion Board. | #     Discussion Board. | ||||||
| # | # | ||||||
| #   - Confirm | #   - Confirm | ||||||
| #     The following commands should retrieve all of the features that have been | #     The following commands should retrieve all of the features that have been | ||||||
| #     annotated for MBP1_MYSPE | #     annotated for MBP1_MYSPE | ||||||
|  |  | ||||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||||
|  |  | ||||||
| (proID  <- myDB$protein$ID[sel]) | (proID  <- myDB$protein$ID[sel]) | ||||||
| (fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID]) | (fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID]) | ||||||
| (ftrIDs <- unique(myDB$annotation$featureID[fanIDs])) | (ftrIDs <- unique(myDB$annotation$featureID[fanIDs])) | ||||||
| myDB$feature$name[ftrIDs] # This should list ALL of your annotated features | myDB$feature$name[ftrIDs] # This should list ALL of your annotated features | ||||||
|                           # (once). If not, consider what could have gone wrong |                           # (once). If not, consider what could have gone wrong | ||||||
|                           # and ask on the list if you have difficulties fixing |                           # and ask on the list if you have difficulties fixing | ||||||
|                           # it. |                           # it. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Plot Annotations  ==================================================== | # =    2  Plot Annotations  ==================================================== | ||||||
|  |  | ||||||
| # In this section we will plot domain annotations as colored rectangles on a | # In this section we will plot domain annotations as colored rectangles on a | ||||||
| # sequence, as an example of using the R plotting system for generic, data | # sequence, as an example of using the R plotting system for generic, data | ||||||
| # driven images. | # driven images. | ||||||
|  |  | ||||||
| # We need a small utility function that draws the annotation boxes on a | # We need a small utility function that draws the annotation boxes on a | ||||||
| # representation of sequence. It should accept the start and end coordinates, | # representation of sequence. It should accept the start and end coordinates, | ||||||
| # the y value where it should be plotted and the color of the box, and plot a | # the y value where it should be plotted and the color of the box, and plot a | ||||||
| # rectangle using R's rect() function. | # rectangle using R's rect() function. | ||||||
|  |  | ||||||
| drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) { | drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) { | ||||||
|   # Draw a box from xStart to xEnd at y, filled with colour myCol |   # Draw a box from xStart to xEnd at y, filled with colour myCol | ||||||
|   # The height of the box is y +- DELTA |   # The height of the box is y +- DELTA | ||||||
|   rect(xStart, (y - DELTA), xEnd, (y + DELTA), |   rect(xStart, (y - DELTA), xEnd, (y + DELTA), | ||||||
|        border = "black", col = myCol) |        border = "black", col = myCol) | ||||||
| } | } | ||||||
|  |  | ||||||
| # test this: | # test this: | ||||||
| plot(c(-1.5, 1.5), c(0, 0), type = "l") | plot(c(-1.5, 1.5), c(0, 0), type = "l") | ||||||
| drawBox(-1, 1, 0.0, "peachpuff") | drawBox(-1, 1, 0.0, "peachpuff") | ||||||
|  |  | ||||||
| # Next, we define a function to plot annotations for one protein: the name of | # Next, we define a function to plot annotations for one protein: the name of | ||||||
| # the protein, a horizontal grey line for its length, and all of its features. | # the protein, a horizontal grey line for its length, and all of its features. | ||||||
|  |  | ||||||
| plotProtein <- function(DB, name, y) { | plotProtein <- function(DB, name, y) { | ||||||
|   # DB: protein database |   # DB: protein database | ||||||
|   # name: the name of the protein in the database. |   # name: the name of the protein in the database. | ||||||
|   # y: height where to draw the plot |   # y: height where to draw the plot | ||||||
|   # |   # | ||||||
|   # Define colors: we create a vector of color values, one for |   # Define colors: we create a vector of color values, one for | ||||||
|   # each feature, and we give it names of the feature ID. Then we |   # each feature, and we give it names of the feature ID. Then we | ||||||
|   # can easily get the color value from the feature name. |   # can easily get the color value from the feature name. | ||||||
|   # A: make a vector of color values. The syntax may appear unusual - |   # A: make a vector of color values. The syntax may appear unusual - | ||||||
|   #    colorRampPalette() returns a function, and we simply append |   #    colorRampPalette() returns a function, and we simply append | ||||||
|   #    the parameter (number-of-features) without assigning the function |   #    the parameter (number-of-features) without assigning the function | ||||||
|   #    to its own variable name. |   #    to its own variable name. | ||||||
|   ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00", |   ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00", | ||||||
|                                "#62C923", "#0A9A9B", "#1958C3", |                                "#62C923", "#0A9A9B", "#1958C3", | ||||||
|                                "#8000D3", "#D0007F"), |                                "#8000D3", "#D0007F"), | ||||||
|                              space="Lab", |                              space="Lab", | ||||||
|                              interpolate="linear")(nrow(DB$feature)) |                              interpolate="linear")(nrow(DB$feature)) | ||||||
|   # B: Features may overlap, so we make the colors transparent by setting |   # B: Features may overlap, so we make the colors transparent by setting | ||||||
|   #    their "alpha channel" to 1/3  (hex: 55) |   #    their "alpha channel" to 1/3  (hex: 55) | ||||||
|   ftrCol <- paste0(ftrCol, "55") |   ftrCol <- paste0(ftrCol, "55") | ||||||
|   # C: we assign names |   # C: we assign names | ||||||
|   names(ftrCol) <- DB$feature$ID |   names(ftrCol) <- DB$feature$ID | ||||||
|   # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ] |   # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ] | ||||||
|  |  | ||||||
|   # find the row-index of the protein ID in the protein table of DB |   # find the row-index of the protein ID in the protein table of DB | ||||||
|   iProtein <- which(DB$protein$name == name) |   iProtein <- which(DB$protein$name == name) | ||||||
|  |  | ||||||
|   # write the name of the protein |   # write the name of the protein | ||||||
|   text(-30, y, adj=1, labels=name, cex=0.75 ) |   text(-30, y, adj=1, labels=name, cex=0.75 ) | ||||||
|  |  | ||||||
|   #draw a line from 0 to nchar(sequence-of-the-protein) |   #draw a line from 0 to nchar(sequence-of-the-protein) | ||||||
|   lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y), |   lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y), | ||||||
|         lwd=3, col="#999999") |         lwd=3, col="#999999") | ||||||
|  |  | ||||||
|   # get the rows of feature annotations for the protein |   # get the rows of feature annotations for the protein | ||||||
|   iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein]) |   iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein]) | ||||||
|  |  | ||||||
|   # draw a colored box for each feature |   # draw a colored box for each feature | ||||||
|   for (i in iFtr) { |   for (i in iFtr) { | ||||||
|     drawBox(DB$annotation$start[i], |     drawBox(DB$annotation$start[i], | ||||||
|             DB$annotation$end[i], |             DB$annotation$end[i], | ||||||
|             y, |             y, | ||||||
|             ftrCol[ DB$annotation$featureID[i] ]) |             ftrCol[ DB$annotation$featureID[i] ]) | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| # Plot each annotated protein: | # Plot each annotated protein: | ||||||
| # Get the rows of all unique annotated Mbp1 proteins in myDB | # Get the rows of all unique annotated Mbp1 proteins in myDB | ||||||
|  |  | ||||||
| iRows <- grep("^MBP1_", myDB$protein$name) | iRows <- grep("^MBP1_", myDB$protein$name) | ||||||
|  |  | ||||||
| # define the size of the plot-frame to accommodate all proteins | # define the size of the plot-frame to accommodate all proteins | ||||||
| yMax <- length(iRows) * 1.1 | yMax <- length(iRows) * 1.1 | ||||||
| xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence | xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence | ||||||
|  |  | ||||||
| # plot an empty frame | # plot an empty frame | ||||||
| oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and | oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and | ||||||
|                                         # decrease margins |                                         # decrease margins | ||||||
| plot(1, 1, | plot(1, 1, | ||||||
|      xlim = c(-200, xMax + 100), |      xlim = c(-200, xMax + 100), | ||||||
|      ylim = c(0, yMax), |      ylim = c(0, yMax), | ||||||
|      type = "n", |      type = "n", | ||||||
|      axes = FALSE, |      axes = FALSE, | ||||||
|      bty = "n", |      bty = "n", | ||||||
|      main = "Mbp1 orthologue domain annotations", |      main = "Mbp1 orthologue domain annotations", | ||||||
|      xlab = "sequence position", |      xlab = "sequence position", | ||||||
|      cex.axis = 0.8, |      cex.axis = 0.8, | ||||||
|      ylab="") |      ylab="") | ||||||
| axis(1, at = seq(0, xMax, by = 100)) | axis(1, at = seq(0, xMax, by = 100)) | ||||||
| myCol <- colorRampPalette(c("#f2003c", "#F0A200", | myCol <- colorRampPalette(c("#f2003c", "#F0A200", | ||||||
|                             "#f0ea00", "#62C923", |                             "#f0ea00", "#62C923", | ||||||
|                             "#0A9A9B", "#1958C3", |                             "#0A9A9B", "#1958C3", | ||||||
|                             "#8000D3", "#D0007F"), |                             "#8000D3", "#D0007F"), | ||||||
|                           space="Lab", |                           space="Lab", | ||||||
|                           interpolate="linear")(nrow(myDB$feature)) |                           interpolate="linear")(nrow(myDB$feature)) | ||||||
| myCol <- paste0(myCol, "55") | myCol <- paste0(myCol, "55") | ||||||
| legend(xMax - 150, 7, | legend(xMax - 150, 7, | ||||||
|        legend = myDB$feature$name, |        legend = myDB$feature$name, | ||||||
|        cex = 0.7, |        cex = 0.7, | ||||||
|        fill = myCol, |        fill = myCol, | ||||||
|        bty = "n") |        bty = "n") | ||||||
|  |  | ||||||
| # Finally, iterate over all proteins and call plotProtein() | # Finally, iterate over all proteins and call plotProtein() | ||||||
| for (i in seq_along(iRows)) { | for (i in seq_along(iRows)) { | ||||||
|   plotProtein(myDB, myDB$protein$name[iRows[i]], i) |   plotProtein(myDB, myDB$protein$name[iRows[i]], i) | ||||||
| } | } | ||||||
| par(oPar)  # reset the plot parameters | par(oPar)  # reset the plot parameters | ||||||
|  |  | ||||||
|  |  | ||||||
| # The plot shows what is variable and what is constant about the annotations in | # The plot shows what is variable and what is constant about the annotations in | ||||||
| # a group of related proteins. Your MBP1_MYSPE annotations should appear at the | # a group of related proteins. Your MBP1_MYSPE annotations should appear at the | ||||||
| # top. | # top. | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| #    Put a copy of the plot into your journal and interpret it with respect | #    Put a copy of the plot into your journal and interpret it with respect | ||||||
| #    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot. | #    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot. | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| #    It would be better to align the motif borders, at least approximately (not | #    It would be better to align the motif borders, at least approximately (not | ||||||
| #    all proteins have all motifs). How would you go about doing that? | #    all proteins have all motifs). How would you go about doing that? | ||||||
|  |  | ||||||
| # =    3  SHARING DATA  ======================================================== | # =    3  SHARING DATA  ======================================================== | ||||||
|  |  | ||||||
| # It's particularly interesting to compare such annotations across many | # It's particularly interesting to compare such annotations across many | ||||||
| # homologous proteins. I have created a page on the Student Wiki that you can | # homologous proteins. I have created a page on the Student Wiki that you can | ||||||
| # edit, and then download the data from the entire class directly to your | # edit, and then download the data from the entire class directly to your | ||||||
| # RStudio project. | # RStudio project. | ||||||
| # | # | ||||||
|  |  | ||||||
| # I have provided a function that extracts all information that refers to a | # I have provided a function that extracts all information that refers to a | ||||||
| # single protein from the database, and prints it out as well-formatted JSON, | # single protein from the database, and prints it out as well-formatted JSON, | ||||||
| # suitable to be pasted into our shareable Wiki-page. There is a fair amount of | # suitable to be pasted into our shareable Wiki-page. There is a fair amount of | ||||||
| # bookkeeping involved, but the code is not otherwise very enlightening so I | # bookkeeping involved, but the code is not otherwise very enlightening so I | ||||||
| # will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you | # will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you | ||||||
| # would want to have a look. | # would want to have a look. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.1  Post MBP1_MYSPE as JSON data  ====================================== | # ==   3.1  Post MBP1_MYSPE as JSON data  ====================================== | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| # ===== | # ===== | ||||||
| # 1: Run the following code: | # 1: Run the following code: | ||||||
|  |  | ||||||
| cat("{{Vspace}}", | cat("{{Vspace}}", | ||||||
|     "<!-- ==== BEGIN  PROTEIN ==== -->", |     "<!-- ==== BEGIN  PROTEIN ==== -->", | ||||||
|     "<pre class=\"protein-data\">", |     "<pre class=\"protein-data\">", | ||||||
|     dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))), |     dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))), | ||||||
|     "</pre>", |     "</pre>", | ||||||
|     "<!-- ===== END PROTEIN ====== -->", |     "<!-- ===== END PROTEIN ====== -->", | ||||||
|     "", sep = "\n" |     "", sep = "\n" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| # 2: Copy the entire output from the console. | # 2: Copy the entire output from the console. | ||||||
| # 3: Navigate to | # 3: Navigate to | ||||||
| #      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public | #      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public | ||||||
| #    ... edit the page, and paste your output at the top. | #    ... edit the page, and paste your output at the top. | ||||||
| # 4: Save your edits. | # 4: Save your edits. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================ | # ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================ | ||||||
|  |  | ||||||
| # Once we have collected a number of protein annotations, we can access the | # Once we have collected a number of protein annotations, we can access the | ||||||
| # Wiki-page and import the data into our database. The Wiki page is an HTML | # Wiki-page and import the data into our database. The Wiki page is an HTML | ||||||
| # document with lots of MediaWiki specific stuff - but the content we are | # document with lots of MediaWiki specific stuff - but the content we are | ||||||
| # interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These | # interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These | ||||||
| # work like normal HTML <pre> tags, but we have defined a special class for them | # work like normal HTML <pre> tags, but we have defined a special class for them | ||||||
| # to make it easy to parse out the contents we want. The rvest:: package in | # to make it easy to parse out the contents we want. The rvest:: package in | ||||||
| # combination with xml2:: provides us with all the tools we need for such | # combination with xml2:: provides us with all the tools we need for such | ||||||
| # "Webscraping" of data.... | # "Webscraping" of data.... | ||||||
|  |  | ||||||
| if (! requireNamespace("rvest", quietly=TRUE)) { | if (! requireNamespace("rvest", quietly=TRUE)) { | ||||||
|   install.packages("rvest") |   install.packages("rvest") | ||||||
| } | } | ||||||
|  |  | ||||||
| if (! requireNamespace("xml2", quietly=TRUE)) { | if (! requireNamespace("xml2", quietly=TRUE)) { | ||||||
|   install.packages("xml2") |   install.packages("xml2") | ||||||
| } | } | ||||||
|  |  | ||||||
| # Here's the process: | # Here's the process: | ||||||
| # The URL is an "open" page on the student Wiki. Users that are not logged in | # The URL is an "open" page on the student Wiki. Users that are not logged in | ||||||
| # can view the contents, but you can only edit if you are logged in. | # can view the contents, but you can only edit if you are logged in. | ||||||
| myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public" | myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public" | ||||||
|  |  | ||||||
| # First thing is to retrieve the HTML from the url... | # First thing is to retrieve the HTML from the url... | ||||||
| x <- xml2::read_html(myURL) | x <- xml2::read_html(myURL) | ||||||
|  |  | ||||||
| # This retrieves the page source, but that still needs to be parsed into its | # This retrieves the page source, but that still needs to be parsed into its | ||||||
| # logical elements. HTML is a subset of XML and such documents are structured as | # logical elements. HTML is a subset of XML and such documents are structured as | ||||||
| # trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes() | # trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes() | ||||||
| # parses out the document structure and then uses a so-called "xpath" expression | # parses out the document structure and then uses a so-called "xpath" expression | ||||||
| # to select nodes we are interested in. Now, xpath is one of those specialized | # to select nodes we are interested in. Now, xpath is one of those specialized | ||||||
| # languages of which there are a few more to learn than one would care for. You | # languages of which there are a few more to learn than one would care for. You | ||||||
| # MUST know how to format sprintf() expressions, and you SHOULD be competent | # MUST know how to format sprintf() expressions, and you SHOULD be competent | ||||||
| # with regular expressions. But if you want to be really competent in your work, | # with regular expressions. But if you want to be really competent in your work, | ||||||
| # basic HTML and CSS is required ... and enough knowledge about xpath to be able | # basic HTML and CSS is required ... and enough knowledge about xpath to be able | ||||||
| # to search on Stackoverflow for what you need for parsing data out of Web | # to search on Stackoverflow for what you need for parsing data out of Web | ||||||
| # documents... | # documents... | ||||||
|  |  | ||||||
| # The expression we use below is: | # The expression we use below is: | ||||||
| #   - get any node anywhere in the tree ("//*") ... | #   - get any node anywhere in the tree ("//*") ... | ||||||
| #   - that has a particular attribute("[@ ... ]"). | #   - that has a particular attribute("[@ ... ]"). | ||||||
| #   - The attribute we want is that the class of the node is "protein-data"; | #   - The attribute we want is that the class of the node is "protein-data"; | ||||||
| #      that is the class we have defined for our <pre> tags. | #      that is the class we have defined for our <pre> tags. | ||||||
| # As a result of this selection, we get a list of pointers to the document tree. | # As a result of this selection, we get a list of pointers to the document tree. | ||||||
| y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]') | y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]') | ||||||
|  |  | ||||||
| # Next we fetch the actual payload - the text - from the tree: | # Next we fetch the actual payload - the text - from the tree: | ||||||
| # rvest::html_text() gets the text from the list of pointers. The result is a | # rvest::html_text() gets the text from the list of pointers. The result is a | ||||||
| # normal list of character strings. | # normal list of character strings. | ||||||
| z <- rvest::html_text(y) | z <- rvest::html_text(y) | ||||||
|  |  | ||||||
| # Finally we can iterate over the list, and add all proteins we don't already | # Finally we can iterate over the list, and add all proteins we don't already | ||||||
| # have to our database. There may well be items that are rejected because they | # have to our database. There may well be items that are rejected because they | ||||||
| # are already present in the database - for example, unless somebody has | # are already present in the database - for example, unless somebody has | ||||||
| # annotated new features, all of the features are already there. Don't worry - | # annotated new features, all of the features are already there. Don't worry - | ||||||
| # that is intended; we don't want duplicate entries. | # that is intended; we don't want duplicate entries. | ||||||
|  |  | ||||||
| for (thisJSON in z) { | for (thisJSON in z) { | ||||||
|   thisData <- jsonlite::fromJSON(thisJSON) |   thisData <- jsonlite::fromJSON(thisJSON) | ||||||
|   if (! thisData$protein$name %in% myDB$protein$name) { |   if (! thisData$protein$name %in% myDB$protein$name) { | ||||||
|     myDB <- dbAddProtein(myDB, thisData$protein) |     myDB <- dbAddProtein(myDB, thisData$protein) | ||||||
|     myDB <- dbAddTaxonomy(myDB, thisData$taxonomy) |     myDB <- dbAddTaxonomy(myDB, thisData$taxonomy) | ||||||
|     myDB <- dbAddFeature(myDB, thisData$feature) |     myDB <- dbAddFeature(myDB, thisData$feature) | ||||||
|     myDB <- dbAddAnnotation(myDB, thisData$annotation) |     myDB <- dbAddAnnotation(myDB, thisData$annotation) | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| # Finally, we can repeat our domain plot with the results - which now includes the shared proteins: | # Finally, we can repeat our domain plot with the results - which now includes the shared proteins: | ||||||
|  |  | ||||||
| iRows <- grep("^MBP1_", myDB$protein$name) | iRows <- grep("^MBP1_", myDB$protein$name) | ||||||
| yMax <- length(iRows) * 1.1 | yMax <- length(iRows) * 1.1 | ||||||
| xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence | xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence | ||||||
|  |  | ||||||
| # plot an empty frame | # plot an empty frame | ||||||
| oPar <- par(mar = c(4.2, 0.1, 3, 0.1)) | oPar <- par(mar = c(4.2, 0.1, 3, 0.1)) | ||||||
| plot(1, 1, | plot(1, 1, | ||||||
|      xlim = c(-200, xMax + 100), |      xlim = c(-200, xMax + 100), | ||||||
|      ylim = c(0, yMax), |      ylim = c(0, yMax), | ||||||
|      type = "n", |      type = "n", | ||||||
|      axes = FALSE, |      axes = FALSE, | ||||||
|      bty = "n", |      bty = "n", | ||||||
|      main = "Mbp1 orthologue domain annotations", |      main = "Mbp1 orthologue domain annotations", | ||||||
|      xlab = "sequence position", |      xlab = "sequence position", | ||||||
|      cex.axis = 0.8, |      cex.axis = 0.8, | ||||||
|      ylab="") |      ylab="") | ||||||
| axis(1, at = seq(0, xMax, by = 100)) | axis(1, at = seq(0, xMax, by = 100)) | ||||||
| myCol <- colorRampPalette(c("#f2003c", "#F0A200", | myCol <- colorRampPalette(c("#f2003c", "#F0A200", | ||||||
|                             "#f0ea00", "#62C923", |                             "#f0ea00", "#62C923", | ||||||
|                             "#0A9A9B", "#1958C3", |                             "#0A9A9B", "#1958C3", | ||||||
|                             "#8000D3", "#D0007F"), |                             "#8000D3", "#D0007F"), | ||||||
|                           space="Lab", |                           space="Lab", | ||||||
|                           interpolate="linear")(nrow(myDB$feature)) |                           interpolate="linear")(nrow(myDB$feature)) | ||||||
| myCol <- paste0(myCol, "55") | myCol <- paste0(myCol, "55") | ||||||
| legend(xMax - 150, 7, | legend(xMax - 150, 7, | ||||||
|        legend = myDB$feature$name, |        legend = myDB$feature$name, | ||||||
|        cex = 0.7, |        cex = 0.7, | ||||||
|        fill = myCol, |        fill = myCol, | ||||||
|        bty = "n") |        bty = "n") | ||||||
|  |  | ||||||
| for (i in seq_along(iRows)) { | for (i in seq_along(iRows)) { | ||||||
|   plotProtein(myDB, myDB$protein$name[iRows[i]], i) |   plotProtein(myDB, myDB$protein$name[iRows[i]], i) | ||||||
| } | } | ||||||
| par(oPar)  # reset the plot parameters | par(oPar)  # reset the plot parameters | ||||||
|  |  | ||||||
| # ... the more proteins we can compare, the more we learn about the | # ... the more proteins we can compare, the more we learn about the | ||||||
| # architectural principles of this family's domains. | # architectural principles of this family's domains. | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,169 +1,169 @@ | |||||||
| # tocID <- "BIN-FUNC-Semantic_similarity.R" | # tocID <- "BIN-FUNC-Semantic_similarity.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-FUNC_Semantic_similarity unit. | #              R code accompanying the BIN-FUNC_Semantic_similarity unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-11  -  2020-09 | # Date:     2017-11  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Maintenance | #           1.2    2020 Maintenance | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0    New code. | #           1.0    New code. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                Line | #TOC>   Section  Title                                                Line | ||||||
| #TOC> -------------------------------------------------------------------- | #TOC> -------------------------------------------------------------------- | ||||||
| #TOC>   1        Preparations: Packages, AnnotationDB, Setup            43 | #TOC>   1        Preparations: Packages, AnnotationDB, Setup            43 | ||||||
| #TOC>   2        Fetch GO Annotations                                  100 | #TOC>   2        Fetch GO Annotations                                  100 | ||||||
| #TOC>   3        Semantic Similarities                                 109 | #TOC>   3        Semantic Similarities                                 109 | ||||||
| #TOC>   4        GO Term Enrichment in Gene Sets                       127 | #TOC>   4        GO Term Enrichment in Gene Sets                       127 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Preparations: Packages, AnnotationDB, Setup  ========================= | # =    1  Preparations: Packages, AnnotationDB, Setup  ========================= | ||||||
|  |  | ||||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
|  |  | ||||||
| # GOSim is an R-package in the Bioconductor project. | # GOSim is an R-package in the Bioconductor project. | ||||||
| if (! requireNamespace("GOSim", quietly = TRUE)) { | if (! requireNamespace("GOSim", quietly = TRUE)) { | ||||||
|   BiocManager::install("GOSim") |   BiocManager::install("GOSim") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = GOSim)       # basic information | #  library(help = GOSim)       # basic information | ||||||
| #  browseVignettes("GOSim")    # available vignettes | #  browseVignettes("GOSim")    # available vignettes | ||||||
| #  data(package = "GOSim")     # available datasets | #  data(package = "GOSim")     # available datasets | ||||||
|  |  | ||||||
| # GOSim makes extensive assumptions about loaded packages, and many base | # GOSim makes extensive assumptions about loaded packages, and many base | ||||||
| # methods are masked. We will thus use library(GOSim) to load it | # methods are masked. We will thus use library(GOSim) to load it | ||||||
| # in its entirety and with all packages it depends on. We will still use | # in its entirety and with all packages it depends on. We will still use | ||||||
| # the <package>::<function>() syntax in the code below, but this now serves | # the <package>::<function>() syntax in the code below, but this now serves | ||||||
| # more of a didactic purpose, rather than actual syntax requirements. | # more of a didactic purpose, rather than actual syntax requirements. | ||||||
|  |  | ||||||
| library(GOSim) | library(GOSim) | ||||||
|  |  | ||||||
| # GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast | # GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast | ||||||
| # annotations instead... | # annotations instead... | ||||||
| if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) { | if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) { | ||||||
|   BiocManager::install("org.Sc.sgd.db") |   BiocManager::install("org.Sc.sgd.db") | ||||||
| } | } | ||||||
|  |  | ||||||
| # Bioconductor annotation packages won't work stably unless we actually load | # Bioconductor annotation packages won't work stably unless we actually load | ||||||
| # them: | # them: | ||||||
| library(org.Sc.sgd.db) | library(org.Sc.sgd.db) | ||||||
|  |  | ||||||
| # org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such | # org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such | ||||||
| # databases exist for all model organisms. It's a kind of a fancy data frame | # databases exist for all model organisms. It's a kind of a fancy data frame | ||||||
| # from which we can get annotations by rows (genes) with the keys() function ... | # from which we can get annotations by rows (genes) with the keys() function ... | ||||||
| AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510] | AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510] | ||||||
|  |  | ||||||
| # ... and the types of available annotations with the columns() function | # ... and the types of available annotations with the columns() function | ||||||
| AnnotationDbi::columns(org.Sc.sgd.db) | AnnotationDbi::columns(org.Sc.sgd.db) | ||||||
|  |  | ||||||
| # Note that one of the columns is "GO" ... and we load that into the | # Note that one of the columns is "GO" ... and we load that into the | ||||||
| # datastructures used by GOSim: | # datastructures used by GOSim: | ||||||
|  |  | ||||||
| # Choose GOterms to use | # Choose GOterms to use | ||||||
| GOSim::setEvidenceLevel(evidences = "all", | GOSim::setEvidenceLevel(evidences = "all", | ||||||
|                         organism = org.Sc.sgdORGANISM, |                         organism = org.Sc.sgdORGANISM, | ||||||
|                         gomap = org.Sc.sgdGO) |                         gomap = org.Sc.sgdGO) | ||||||
|  |  | ||||||
| # Use Biological Process ontology | # Use Biological Process ontology | ||||||
| GOSim::setOntology("BP", loadIC = FALSE) | GOSim::setOntology("BP", loadIC = FALSE) | ||||||
|  |  | ||||||
| # confirm that we loaded the correct ontology | # confirm that we loaded the correct ontology | ||||||
| head(get("gomap", envir = GOSimEnv)) | head(get("gomap", envir = GOSimEnv)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Fetch GO Annotations  ================================================ | # =    2  Fetch GO Annotations  ================================================ | ||||||
|  |  | ||||||
|  |  | ||||||
| # All keys being used here are yeast systematic names. | # All keys being used here are yeast systematic names. | ||||||
|  |  | ||||||
| # Get one set of annotations | # Get one set of annotations | ||||||
| GOSim::getGOInfo(c("YDL056W"))  # Mbp1 | GOSim::getGOInfo(c("YDL056W"))  # Mbp1 | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Semantic Similarities  =============================================== | # =    3  Semantic Similarities  =============================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Get semantic similarities between genes | # Get semantic similarities between genes | ||||||
| ?getGeneSim | ?getGeneSim | ||||||
|  |  | ||||||
| # There are _many_ different metrics of term similarity implemented | # There are _many_ different metrics of term similarity implemented | ||||||
| # in this package. | # in this package. | ||||||
|  |  | ||||||
|                                                          # Mbp1 and... |                                                          # Mbp1 and... | ||||||
| GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex | GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex | ||||||
| GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators | GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators | ||||||
| GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator | GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator | ||||||
| GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist | GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist | ||||||
| GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist | GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist | ||||||
| GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis | GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  GO Term Enrichment in Gene Sets  ===================================== | # =    4  GO Term Enrichment in Gene Sets  ===================================== | ||||||
|  |  | ||||||
|  |  | ||||||
# GO term enrichment for a set of genes is computed with the Bioconductor
# package topGO. Install it first if it is not yet available.
if (! requireNamespace("topGO", quietly = TRUE)) {
  BiocManager::install("topGO")
}
# Package information:
#  library(help = topGO)       # basic information
#  browseVignettes("topGO")    # available vignettes
#  data(package = "topGO")     # available datasets

# As before: GOsim makes assumptions that require the topGO package to be
# attached as a whole, rather than being accessed with the :: operator.
library(topGO)

# The gene set of interest: activators of the G1/S switch.
mySet <- c(
  "YFR028C",   # Cdc14
  "YDL056W",   # Mbp1
  "YLR182W",   # Swi6
  "YER111C",   # Swi4
  "YOR083W",   # Whi5
  "YBR160W",   # Cdc28
  "YMR199W",   # Cln1
  "YPL256C",   # Cln2
  "YAL040C"    # Cln3
)

# The context against which enrichment is defined: all keys of the annotation
# database that begin with "Y".
allGenes <- grep("^Y", AnnotationDbi::keys(org.Sc.sgd.db), value = TRUE)

myEnr <- GOenrichment(mySet, allGenes)

# Any significantly enriched terms? All of these are ...
sort(myEnr$p.values)

# The most significantly enriched term is GO:0071931. What is this?
annotate::getGOTerm("GO:0071931")  # ... makes sense.
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										702
									
								
								BIN-MYSPE.R
									
									
									
									
									
								
							
							
						
						
									
										702
									
								
								BIN-MYSPE.R
									
									
									
									
									
								
							| @@ -1,351 +1,351 @@ | |||||||
| # tocID <- "BIN-MYSPE.R" | # tocID <- "BIN-MYSPE.R" | ||||||
| # | # | ||||||
| # Purpose: A Bioinformatics Course: | # Purpose: A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-MYSPE unit | #              R code accompanying the BIN-MYSPE unit | ||||||
| # | # | ||||||
| # | # | ||||||
| # Version: 1.4 | # Version: 1.4 | ||||||
| # | # | ||||||
| # Date:    2017-09 - 2021-10 | # Date:    2017-09 - 2021-10 | ||||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # V 1.4    Add troubleshooting hints via errText[[...]] | # V 1.4    Add troubleshooting hints via errText[[...]] | ||||||
| # V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about | # V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about | ||||||
| # V 1.2    Reorganized proportional plot section into a "further reading" | # V 1.2    Reorganized proportional plot section into a "further reading" | ||||||
| #          section, added nested-box, and sankey plot visualization of | #          section, added nested-box, and sankey plot visualization of | ||||||
| #          proportions. Introduced plotly. | #          proportions. Introduced plotly. | ||||||
| # V 1.1    2020 Workflow changes | # V 1.1    2020 Workflow changes | ||||||
| # V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory | # V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory | ||||||
| # V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist | # V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist | ||||||
| # V 0.1    First code copied from BCH441_A03_makeMYSPElist.R | # V 0.1    First code copied from BCH441_A03_makeMYSPElist.R | ||||||
| # | # | ||||||
| # TODO:    Sample solution for sankey plot function. | # TODO:    Sample solution for sankey plot function. | ||||||
| # | # | ||||||
| # | # | ||||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||||
| # | # | ||||||
| # DO NOT SIMPLY  source()  THESE FILES! | # DO NOT SIMPLY  source()  THESE FILES! | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| #  going on. That's not how it works ... | #  going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                             Line | #TOC>   Section  Title                                             Line | ||||||
| #TOC> ----------------------------------------------------------------- | #TOC> ----------------------------------------------------------------- | ||||||
| #TOC>   1        PREPARATIONS                                        52 | #TOC>   1        PREPARATIONS                                        52 | ||||||
| #TOC>   2        SUITABLE MYSPE SPECIES                              65 | #TOC>   2        SUITABLE MYSPE SPECIES                              65 | ||||||
| #TOC>   3        ADOPT "MYSPE"                                       89 | #TOC>   3        ADOPT "MYSPE"                                       89 | ||||||
| #TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128 | #TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128 | ||||||
| #TOC>   4.1        Percentages                                      146 | #TOC>   4.1        Percentages                                      146 | ||||||
| #TOC>   4.2        Visualizing proportions: Pie chart               165 | #TOC>   4.2        Visualizing proportions: Pie chart               165 | ||||||
| #TOC>   4.3        Visualizing proportions: Nested squares          243 | #TOC>   4.3        Visualizing proportions: Nested squares          243 | ||||||
| #TOC>   4.4        Visualizing proportions: Sankey diagrams         280 | #TOC>   4.4        Visualizing proportions: Sankey diagrams         280 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  PREPARATIONS  ======================================================== | # =    1  PREPARATIONS  ======================================================== | ||||||
| # | # | ||||||
|  |  | ||||||
# Execute both checks below. Each one stops with a troubleshooting hint if the
# session has not been set up; the hint texts are defined in .Rprofile.

# Check 1: the personal profile file must exist in the myScripts folder.
if (! isTRUE(file.exists("./myScripts/.myProfile.R"))) {
  stop(errText[["noProfileFile"]])
}

# Check 2: an object named myStudentNumber must exist in this session.
if (! isTRUE(exists("myStudentNumber"))) {
  stop(errText[["noStudentNumber"]])
}
|  |  | ||||||
|  |  | ||||||
| # =    2  SUITABLE MYSPE SPECIES  ============================================== | # =    2  SUITABLE MYSPE SPECIES  ============================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # In this unit we will select one species from a list of genome sequenced fungi | # In this unit we will select one species from a list of genome sequenced fungi | ||||||
| # and write it into your personalized profile file. This species will be called | # and write it into your personalized profile file. This species will be called | ||||||
| # "MYSPE" (My Species) for other learning units and exercises. | # "MYSPE" (My Species) for other learning units and exercises. | ||||||
|  |  | ||||||
| # A detailed description of the process of compiling the list of genome | # A detailed description of the process of compiling the list of genome | ||||||
| # sequenced fungi with protein annotations and Mbp1 homologues is in the file | # sequenced fungi with protein annotations and Mbp1 homologues is in the file | ||||||
| # ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi | # ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi | ||||||
| # was retrieved from https://fungi.ensembl.org; a search for homologues to | # was retrieved from https://fungi.ensembl.org; a search for homologues to | ||||||
| # yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged. | # yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged. | ||||||
| # A representative organism at each genus-level was chosen from those hits | # A representative organism at each genus-level was chosen from those hits | ||||||
| # that actually have a homologue. Finally, a mapping table was constructed to | # that actually have a homologue. Finally, a mapping table was constructed to | ||||||
| # asymmetrically retrieve unique species: a student number will retrieve | # asymmetrically retrieve unique species: a student number will retrieve | ||||||
| # a species, but (public) knowledge of the species cannot reconstruct the | # a species, but (public) knowledge of the species cannot reconstruct the | ||||||
| # student number. | # student number. | ||||||
|  |  | ||||||
| # Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow | # Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow | ||||||
| #       of selecting and combining data from various data resources. Studying | #       of selecting and combining data from various data resources. Studying | ||||||
| #       it will give you a better sense of how such workflows can be | #       it will give you a better sense of how such workflows can be | ||||||
| #       implemented in practice. | #       implemented in practice. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  ADOPT "MYSPE"  ======================================================= | # =    3  ADOPT "MYSPE"  ======================================================= | ||||||
|  |  | ||||||
# Execute:
MYSPE <- getMYSPE(myStudentNumber)
MYSPE

# If this produced an error, this session has not been properly set up. You
# may not yet have run  init()  and edited  .myProfile.R , or that file is not
# in your  myScripts/  folder. Fix this, and execute:
#
#    source(".Rprofile") .

# If this produced NA, your Student Number may not be correct, or you are not in
# my class-list. Contact me. Otherwise, this should have printed a species name,
# and the taxonomy ID of its genome-sequenced strain. This is your unique
# species for this course. Note it in your journal ...

biCode(MYSPE)                 # ... and also note its "BiCode" ...
myTaxID <- names(MYSPE)       # ... and its taxID, which is stored as the
myTaxID                       #     name of the MYSPE element.


# Task:
# =====
#   Note down the species name and its five letter BiCode on your Student
#   Wiki user page. Use this species whenever this or future assignments refer
#   to MYSPE. Whenever you start a session, it will automatically be loaded
#   from  myScripts/.myProfile.R  and is available as  MYSPE .

# Here is some more information about MYSPE, taken from the table of genome-
# sequenced fungi that is in your ./data folder.
fungiDat <- read.csv(file = "data/Species.csv")
iMs <- which(fungiDat$Taxon.ID == myTaxID)    # row(s) with the MYSPE taxID

myOr <- fungiDat$Classification[iMs]          # Taxonomic order
myOr
myGn <- sub("\\s.*", "", MYSPE)               # Taxonomic genus: everything
myGn                                          # before the first whitespace
mySt <- fungiDat$Name[iMs]                    # Taxonomic strain
mySt
|  |  | ||||||
| # That's all. | # That's all. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  FURTHER READING: PLOTTING PROPORTIONS  =============================== | # =    4  FURTHER READING: PLOTTING PROPORTIONS  =============================== | ||||||
|  |  | ||||||
| # The material below is an exploration of data-preparation and plotting | # The material below is an exploration of data-preparation and plotting | ||||||
| # techniques; you can treat this as additional practice and further reading and | # techniques; you can treat this as additional practice and further reading and | ||||||
| # I expect that some of the code and plotting examples may be useful in a | # I expect that some of the code and plotting examples may be useful in a | ||||||
| # different context. | # different context. | ||||||
|  |  | ||||||
| # A frequent task is to visualize the proportion of elements with given | # A frequent task is to visualize the proportion of elements with given | ||||||
| # categories in a sample. For example, we might ask: what proportion of all | # categories in a sample. For example, we might ask: what proportion of all | ||||||
| # genome-sequenced fungi belongs to the same order as MYSPE? Let's first | # genome-sequenced fungi belongs to the same order as MYSPE? Let's first | ||||||
| # collect the numbers. | # collect the numbers. | ||||||
|  |  | ||||||
# Count the rows of the table at each level of the nested hierarchy. The
# counts for order, genus and species are pattern matches of myOr, myGn and
# MYSPE against the respective column; each count is printed after it is
# assigned.
nFungi <- nrow(fungiDat)                                  # sequenced fungi
nFungi
nOrder <- length(grep(myOr, fungiDat$Classification))     # same order as MYSPE
nOrder
nGenus <- length(grep(myGn, fungiDat$Name))               # same genus as MYSPE
nGenus
nSpecies <- length(grep(MYSPE, fungiDat$Name))            # same species as MYSPE
nSpecies
|  |  | ||||||
|  |  | ||||||
| # ==   4.1  Percentages  ======================================================= | # ==   4.1  Percentages  ======================================================= | ||||||
|  |  | ||||||
| # The zeroth-order approach to visualization is simply to print percentages: | # The zeroth-order approach to visualization is simply to print percentages: | ||||||
|  |  | ||||||
# Print the share of the MYSPE order as a percentage with two decimals ...
cat(sprintf(fmt = "\n%s comprise %5.2f%% of fungi.",
            myOr, 100 * nOrder / nFungi))

# ... or, adding the actual numbers:

cat(sprintf(fmt = "\n%s comprise %5.2f%% of fungi (%d of %d).",
            myOr, 100 * nOrder / nFungi, nOrder, nFungi))
|  |  | ||||||
| # But that's hard to visualize for most of us, and anyway, we don't know how | # But that's hard to visualize for most of us, and anyway, we don't know how | ||||||
| # that relates to other orders. | # that relates to other orders. | ||||||
|  |  | ||||||
| # ==   4.2  Visualizing proportions: Pie chart  ================================ | # ==   4.2  Visualizing proportions: Pie chart  ================================ | ||||||
|  |  | ||||||
| # Often, we will use a pie chart instead. Pie charts are rather informal types | # Often, we will use a pie chart instead. Pie charts are rather informal types | ||||||
| # of plots, not well suited for analysis. But easy to do: | # of plots, not well suited for analysis. But easy to do: | ||||||
|  |  | ||||||
# Define four colors to identify the four categories
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")

oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
                                           # and remember the
                                           # previous setting

# The four counts are nested: nSpecies is part of nGenus, nGenus is part of
# nOrder, and nOrder is part of nFungi. Each wedge must therefore show only
# what its category adds to the next smaller one, i.e. we subtract just the
# next smaller count. The four wedges then add up to nFungi, and no wedge can
# become negative (pie() stops with an error on negative values).
pie(c(nSpecies,                            # MYSPE itself
      nGenus - nSpecies,                   # rest of the genus
      nOrder - nGenus,                     # rest of the order
      nFungi - nOrder),                    # all other fungi
      labels = "",
      radius = 0.9,
      main = "MYSPE in genome-sequenced fungi",
      lty = 0,                             # turn borders for wedges off
      col = pCol,
      clockwise = TRUE,
      init.angle = 90)

title(main = MYSPE, line = 0, cex.main = 0.7)   # add a title to the plot

legend(x = 0.95, y = 0.8,                  # place the legend here
       legend = c("Species", "Genus", "Order", "Fungi"),
       y.intersp = 2,                      # line spacing for labels
       cex = 0.8,                          # character size for labels
       bty = "n",                          # "no" box around the legend
       pt.cex = 2,                         # size of colour boxes
       pch = 15,                           # a filled square
       col = pCol)

par(oPar)                                  # reset graphics state
|  |  | ||||||
| # Unless MYSPE is one of the frequently sequenced species, there will only be a | # Unless MYSPE is one of the frequently sequenced species, there will only be a | ||||||
| # very thin wedge visible. Pie charts are not well suited to visualize small | # very thin wedge visible. Pie charts are not well suited to visualize small | ||||||
| # proportions. | # proportions. | ||||||
|  |  | ||||||
| # It is a little more useful if we have non-nested proportions - like the | # It is a little more useful if we have non-nested proportions - like the | ||||||
| # number of species in the same order overall: | # number of species in the same order overall: | ||||||
|  |  | ||||||
# Tabulate how often each order appears, most frequent first.
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
head(myTbl)

# pie() does a reasonable job out of the box to interpret table() data:
pie(myTbl)

# ... we can improve this quickly with a bit of tweaking:

N <- length(myTbl)
sel <- names(myTbl) == myOr    # TRUE for the MYSPE order, FALSE elsewhere

myCol <- rep(pCol[4], N)       # N elements of pCol[4], the lightest colour ...
myCol[sel] <- pCol[1]          # ... except for the MYSPE order

myLbl <- character(N)          # N empty labels ...
myLbl[sel] <- myOr             # ... except for the MYSPE order


oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0

pie(x = myTbl,
    labels = myLbl,
    col = myCol,
    border = "#DDDDDD",
    main = "MYSPE order",
    radius = 0.9,
    clockwise = TRUE,
    init.angle = 90)

par(oPar)                                  # reset graphics state
|  |  | ||||||
| # But the overall problem remains. | # But the overall problem remains. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   4.3  Visualizing proportions: Nested squares  =========================== | # ==   4.3  Visualizing proportions: Nested squares  =========================== | ||||||
|  |  | ||||||
| # A simple alternative is to draw such proportions as nested squares: | # A simple alternative is to draw such proportions as nested squares: | ||||||
|  |  | ||||||
# Side length of a square whose area equals the number of sequenced fungi
x <- sqrt(nFungi)

# set margins to ~ 0 and type to square; remember the previous settings
oPar <- par(mar = rep(0.1, 4), pty = "s")

# empty, square plot
plot(x = c(0, x), y = c(0, x),
     xlim = c(0, x), ylim = c(0, x),
     type = "n", axes = FALSE, xlab = "", ylab = "")

# basic square for all genomes
rect(xleft = 0, ybottom = 0, xright = x, ytop = x, col = pCol[4])

# grid: horizontal lines first, then vertical lines
u <- 0:floor(x)
N <- length(u)
segments(x0 = rep(0, N), y0 = u, x1 = rep(x, N), y1 = u, col = "#0000FF18")
segments(x0 = u, y0 = rep(0, N), x1 = u, y1 = rep(x, N), col = "#0000FF18")
# each square on this grid is one genome

# colored squares: rect() is vectorized and draws in the order given, i.e.
# order first, then genus, then species on top
rect(xleft   = 0,
     ybottom = 0,
     xright  = sqrt(c(nOrder, nGenus, nSpecies)),
     ytop    = sqrt(c(nOrder, nGenus, nSpecies)),
     col     = pCol[3:1])

# labels
text(x / 2,    x / 2,    "Fungi")
text(x * 0.08, x * 0.11, myOr,  pos = 4, cex = 0.9)
text(x * 0.08, x * 0.06, myGn,  pos = 4, cex = 0.8)
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)

par(oPar)                                  # reset graphics state
|  |  | ||||||
|  |  | ||||||
| # ==   4.4  Visualizing proportions: Sankey diagrams  ========================== | # ==   4.4  Visualizing proportions: Sankey diagrams  ========================== | ||||||
|  |  | ||||||
| # Sankey diagrams are an excellent way to visualize complicated nested | # Sankey diagrams are an excellent way to visualize complicated nested | ||||||
| # proportions and their changes (see here for example: | # proportions and their changes (see here for example: | ||||||
| # https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple | # https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple | ||||||
| # example with the MYSPE proportions, as an illustration of the plotting | # example with the MYSPE proportions, as an illustration of the plotting | ||||||
| # principle. | # principle. | ||||||
|  |  | ||||||
# Install the plotly package if it is not yet available. quietly = TRUE
# suppresses the loading messages, like the other install guards of this
# course do.
if (! requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
# Package information:
#  library(help   = plotly)     # basic information
#  browseVignettes("plotly")    # available vignettes
#  data(package  = "plotly")    # available datasets
|  |  | ||||||
| # Here, we use the plotly package that wraps a very well developed javascript | # Here, we use the plotly package that wraps a very well developed javascript | ||||||
| # library with many options for interactive plots. I am producing this plot | # library with many options for interactive plots. I am producing this plot | ||||||
| # hard-coded for the sample organism "Sporothrix schenckii"; you would need | # hard-coded for the sample organism "Sporothrix schenckii"; you would need | ||||||
| # to change the code to adapt it to your own MYSPE - or even build a function | # to change the code to adapt it to your own MYSPE - or even build a function | ||||||
| # for this. Do try this if you have a bit of coding experience, sankey diagrams | # for this. Do try this if you have a bit of coding experience, sankey diagrams | ||||||
| # are a good way to show hierarchical data relations - and if you get this | # are a good way to show hierarchical data relations - and if you get this | ||||||
| # working for your own organism you can be proud that you have understood | # working for your own organism you can be proud that you have understood | ||||||
| # how preparing the data works. | # how preparing the data works. | ||||||
|  |  | ||||||
|  |  | ||||||
# Node definitions. Nodes are identified by their zero-based position in these
# vectors; the links below refer to these IDs.
myNodes <- list(
  label = c("Fungi (1014)",              # node ID 0
            "Ophiostomatales (6)",       #         1
            "Other...",                  #         2
            "Sporothrix (4)",            #         3
            "Other...",                  #         4
            "Sporothrix schenckii (2)",  #         5
            "Other..."),                 #         6
  x = c(0.1, rep(c(0.4, 0.7, 1.0), each = 2)),   # one x value per level
  y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
  color = c("#f2f2f0",                   # same colours as in the plots above,
            "#ffd5c4",                   # grey for the "Other..." nodes
            "#CCCCCC",
            "#ff9582",
            "#CCCCCC",
            "#ed394e",
            "#CCCCCC"),
  pad = 15,
  thickness = 20,
  line = list(color = "black", width = 0.5)
)

# Link definitions: the i-th link connects node source[i] to node target[i]
# with weight value[i], i.e. the first link has weight 6 and connects node 0
# with node 1.
myLinks <- list(
  source = c(0, 0, 1, 1, 3, 3),
  target = c(1, 2, 3, 4, 5, 6),
  value  = c(6, 18, 4, 2, 2, 2)
)
|  |  | ||||||
| # Setting up the actual plot ... | # Setting up the actual plot ... | ||||||
| fig  <-  plotly::plot_ly(type = "sankey", | fig  <-  plotly::plot_ly(type = "sankey", | ||||||
|                          arrangement = "snap", |                          arrangement = "snap", | ||||||
|                          orientation = "h", |                          orientation = "h", | ||||||
|                          node = myNodes, |                          node = myNodes, | ||||||
|                          link = myLinks) |                          link = myLinks) | ||||||
|  |  | ||||||
| # Adding and adjusting a few layout parameters | # Adding and adjusting a few layout parameters | ||||||
| fig <- plotly::layout(fig, | fig <- plotly::layout(fig, | ||||||
|               title = "Fungi Genomes - Classification", |               title = "Fungi Genomes - Classification", | ||||||
|               font = list(size = 10)) |               font = list(size = 10)) | ||||||
|  |  | ||||||
| fig     # plot the diagram | fig     # plot the diagram | ||||||
|  |  | ||||||
| # Note that the plot appears in the Viewer window, not the Plot window, and that | # Note that the plot appears in the Viewer window, not the Plot window, and that | ||||||
| # it is interactive: you can hover over nodes and links, and drag the nodes | # it is interactive: you can hover over nodes and links, and drag the nodes | ||||||
| # around. | # around. | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,234 +1,234 @@ | |||||||
| # tocID <- "BIN-PHYLO-Data_preparation.R" | # tocID <- "BIN-PHYLO-Data_preparation.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-PHYLO-Data_preparation unit. | #              R code accompanying the BIN-PHYLO-Data_preparation unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Maintenance | #           1.2    2020 Maintenance | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0    First 2017 version | #           1.0    First 2017 version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                     Line | #TOC>   Section  Title                                     Line | ||||||
| #TOC> --------------------------------------------------------- | #TOC> --------------------------------------------------------- | ||||||
| #TOC>   1        Preparations                                45 | #TOC>   1        Preparations                                45 | ||||||
| #TOC>   2        Fetching sequences                          77 | #TOC>   2        Fetching sequences                          77 | ||||||
| #TOC>   3        Multiple Sequence Alignment                118 | #TOC>   3        Multiple Sequence Alignment                118 | ||||||
| #TOC>   4        Reviewing and Editing Alignments           137 | #TOC>   4        Reviewing and Editing Alignments           137 | ||||||
| #TOC>   4.1        Masking workflow                         153 | #TOC>   4.1        Masking workflow                         153 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Preparations  ======================================================== | # =    1  Preparations  ======================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # You need to reload your protein database, including changes that might have | # You need to reload your protein database, including changes that might have | ||||||
| # been made to the reference files. If you have worked with the prerequisite | # been made to the reference files. If you have worked with the prerequisite | ||||||
| # units, you should have a script named "makeProteinDB.R" that will create the | # units, you should have a script named "makeProteinDB.R" that will create the | ||||||
| # myDB object with a protein and feature database. Ask for advice if not. | # myDB object with a protein and feature database. Ask for advice if not. | ||||||
| source("myScripts/makeProteinDB.R") | source("myScripts/makeProteinDB.R") | ||||||
|  |  | ||||||
| # Load packages we need | # Load packages we need | ||||||
|  |  | ||||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   BiocManager::install("Biostrings") |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! requireNamespace("msa", quietly = TRUE)) { | if (! requireNamespace("msa", quietly = TRUE)) { | ||||||
|   BiocManager::install("msa") |   BiocManager::install("msa") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = msa)       # basic information | #  library(help = msa)       # basic information | ||||||
| #  browseVignettes("msa")  # available vignettes | #  browseVignettes("msa")  # available vignettes | ||||||
| #  data(package = "msa")   # available datasets | #  data(package = "msa")   # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Fetching sequences  ================================================== | # =    2  Fetching sequences  ================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1 | # myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1 | ||||||
| # RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES | # RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES | ||||||
| # domains. You have annotated their ranges as a feature. The following code | # domains. You have annotated their ranges as a feature. The following code | ||||||
| # retrieves the sequences from myDB. You have seen similar code in other units. | # retrieves the sequences from myDB. You have seen similar code in other units. | ||||||
|  |  | ||||||
| sel <- grep("^MBP1_", myDB$protein$name) | sel <- grep("^MBP1_", myDB$protein$name) | ||||||
| (proNames <- myDB$protein$name[sel]) | (proNames <- myDB$protein$name[sel]) | ||||||
| (proIDs <- myDB$protein$ID[sel]) | (proIDs <- myDB$protein$ID[sel]) | ||||||
|  |  | ||||||
| (sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | (sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"]) | ||||||
| (fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% ! | (fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% ! | ||||||
|                               myDB$annotation$featureID == sel])      #  ==  ! |                               myDB$annotation$featureID == sel])      #  ==  ! | ||||||
|                                                                       # Why? |                                                                       # Why? | ||||||
| APSI <- character(length(fanIDs)) | APSI <- character(length(fanIDs)) | ||||||
|  |  | ||||||
| for (i in seq_along(fanIDs)) { | for (i in seq_along(fanIDs)) { | ||||||
|   sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index |   sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index | ||||||
|   proID <- myDB$annotation$proteinID[sel]   # get its protein ID |   proID <- myDB$annotation$proteinID[sel]   # get its protein ID | ||||||
|   start <- myDB$annotation$start[sel]       # get start ... |   start <- myDB$annotation$start[sel]       # get start ... | ||||||
|   end   <- myDB$annotation$end[sel]         # ... and end |   end   <- myDB$annotation$end[sel]         # ... and end | ||||||
|  |  | ||||||
|   sel <- myDB$protein$ID == proID           # get the protein row index ... |   sel <- myDB$protein$ID == proID           # get the protein row index ... | ||||||
|                                             # ... and the sequence |                                             # ... and the sequence | ||||||
|   APSI[i] <- substring(myDB$protein$sequence[sel], start, end) |   APSI[i] <- substring(myDB$protein$sequence[sel], start, end) | ||||||
|   names(APSI)[i] <- (myDB$protein$name[sel]) |   names(APSI)[i] <- (myDB$protein$name[sel]) | ||||||
| } | } | ||||||
|  |  | ||||||
| head(APSI) | head(APSI) | ||||||
|  |  | ||||||
| # Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our | # Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our | ||||||
| # phylogenetic tree (see the unit's Wiki page for details on the sequence). | # phylogenetic tree (see the unit's Wiki page for details on the sequence). | ||||||
|  |  | ||||||
| APSI <- c(APSI, | APSI <- c(APSI, | ||||||
| "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ") | "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ") | ||||||
| names(APSI)[length(APSI)] <- "KILA_ESCCO" | names(APSI)[length(APSI)] <- "KILA_ESCCO" | ||||||
| tail(APSI) | tail(APSI) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Multiple Sequence Alignment  ========================================= | # =    3  Multiple Sequence Alignment  ========================================= | ||||||
|  |  | ||||||
| # This vector of sequences with named elements fulfills the requirements to be | # This vector of sequences with named elements fulfills the requirements to be | ||||||
| # imported as a Biostrings object - an AAStringSet - which we need as input for | # imported as a Biostrings object - an AAStringSet - which we need as input for | ||||||
| # the MSA algorithms in the msa package. | # the MSA algorithms in the msa package. | ||||||
| # | # | ||||||
|  |  | ||||||
| APSESSet <- Biostrings::AAStringSet(APSI) | APSESSet <- Biostrings::AAStringSet(APSI) | ||||||
| APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned") | APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned") | ||||||
|  |  | ||||||
| # Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If | # Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If | ||||||
| # that happens in your case, just use msaClustalOmega() instead. | # that happens in your case, just use msaClustalOmega() instead. | ||||||
|  |  | ||||||
| # inspect the alignment. | # inspect the alignment. | ||||||
| writeALN(APSESMsa) | writeALN(APSESMsa) | ||||||
|  |  | ||||||
| # What do you think? Is this a good alignment for phylogenetic inference? | # What do you think? Is this a good alignment for phylogenetic inference? | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Reviewing and Editing Alignments  ==================================== | # =    4  Reviewing and Editing Alignments  ==================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Head back to the Wiki page for this unit and read up on the background | # Head back to the Wiki page for this unit and read up on the background | ||||||
| # first. | # first. | ||||||
|  |  | ||||||
| # Let's mask out all columns that have observations for | # Let's mask out all columns that have observations for | ||||||
| # less than 1/3 of the sequences in the dataset. This | # less than 1/3 of the sequences in the dataset. This | ||||||
| # means they have more than round(nrow(APSESMsa) * (2/3)) | # means they have more than round(nrow(APSESMsa) * (2/3)) | ||||||
| # hyphens in a column. | # hyphens in a column. | ||||||
| # | # | ||||||
| # We take all sequences, split them into single | # We take all sequences, split them into single | ||||||
| # characters, and put them into a matrix. Then we | # characters, and put them into a matrix. Then we | ||||||
| # go through the matrix, column by column and decide | # go through the matrix, column by column and decide | ||||||
| # whether we want to include that column. | # whether we want to include that column. | ||||||
|  |  | ||||||
| # ==   4.1  Masking workflow  ================================================== | # ==   4.1  Masking workflow  ================================================== | ||||||
|  |  | ||||||
| # get the length of the alignment | # get the length of the alignment | ||||||
| (lenAli <- APSESMsa@unmasked@ranges@width[1]) | (lenAli <- APSESMsa@unmasked@ranges@width[1]) | ||||||
|  |  | ||||||
| # initialize a matrix that can hold all characters | # initialize a matrix that can hold all characters | ||||||
| # individually | # individually | ||||||
| msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli), | msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli), | ||||||
|                     ncol = lenAli) |                     ncol = lenAli) | ||||||
|  |  | ||||||
| # assign the correct rownames | # assign the correct rownames | ||||||
| rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES | rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES | ||||||
| for (i in 1:nrow(APSESMsa)) { | for (i in 1:nrow(APSESMsa)) { | ||||||
|   msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), "")) |   msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), "")) | ||||||
| } | } | ||||||
|  |  | ||||||
| # inspect the result | # inspect the result | ||||||
| msaMatrix[1:7, 30:40] | msaMatrix[1:7, 30:40] | ||||||
|  |  | ||||||
| # Now let's make a logical vector with an element for each column that selects | # Now let's make a logical vector with an element for each column that selects | ||||||
| # which columns should be masked out. | # which columns should be masked out. | ||||||
|  |  | ||||||
| # The number of hyphens in a column is easy to count. Consider: | # The number of hyphens in a column is easy to count. Consider: | ||||||
|  |  | ||||||
|     msaMatrix[ , 20]             # column 20 |     msaMatrix[ , 20]             # column 20 | ||||||
|     msaMatrix[ , 20] == "-"      # TRUE for all gap characters |     msaMatrix[ , 20] == "-"      # TRUE for all gap characters | ||||||
| sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE | sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE | ||||||
|  |  | ||||||
| # Thus filling our logical vector is simple: | # Thus filling our logical vector is simple: | ||||||
|  |  | ||||||
| # initialize a mask | # initialize a mask | ||||||
| colMask <- logical(ncol(msaMatrix)) | colMask <- logical(ncol(msaMatrix)) | ||||||
|  |  | ||||||
| # define the threshold for rejecting a column | # define the threshold for rejecting a column | ||||||
| limit <- round(nrow(APSESMsa) * (2/3)) | limit <- round(nrow(APSESMsa) * (2/3)) | ||||||
|  |  | ||||||
| # iterate over all columns, and write TRUE if there are less-or-equal to "limit" | # iterate over all columns, and write TRUE if there are less-or-equal to "limit" | ||||||
| # hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis | # hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis | ||||||
| # and FALSE columns will be rejected. | # and FALSE columns will be rejected. | ||||||
| for (i in 1:ncol(msaMatrix)) { | for (i in 1:ncol(msaMatrix)) { | ||||||
|   count <- sum(msaMatrix[ , i] == "-") |   count <- sum(msaMatrix[ , i] == "-") | ||||||
|   colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not |   colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not | ||||||
| } | } | ||||||
|  |  | ||||||
| # Inspect the mask | # Inspect the mask | ||||||
| colMask | colMask | ||||||
|  |  | ||||||
| # How many positions are being kept? | # How many positions are being kept? | ||||||
| sum(colMask) | sum(colMask) | ||||||
|  |  | ||||||
| cat(sprintf("We are masking %4.2f %% of alignment columns.\n", | cat(sprintf("We are masking %4.2f %% of alignment columns.\n", | ||||||
|             100 * (1 - (sum(colMask) / length(colMask))))) |             100 * (1 - (sum(colMask) / length(colMask))))) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Next, we use colMask to remove the masked columns from the matrix | # Next, we use colMask to remove the masked columns from the matrix | ||||||
| # in one step: | # in one step: | ||||||
| maskedMatrix <- msaMatrix[ , colMask] | maskedMatrix <- msaMatrix[ , colMask] | ||||||
|  |  | ||||||
| # check: | # check: | ||||||
| ncol(maskedMatrix) | ncol(maskedMatrix) | ||||||
|  |  | ||||||
| # ... then collapse each row of single characters back into a string ... | # ... then collapse each row of single characters back into a string ... | ||||||
| APSESphyloSet <- character() | APSESphyloSet <- character() | ||||||
| for (i in 1:nrow(maskedMatrix)) { | for (i in 1:nrow(maskedMatrix)) { | ||||||
|   APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="") |   APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse="") | ||||||
| } | } | ||||||
| names(APSESphyloSet) <- rownames(maskedMatrix) | names(APSESphyloSet) <- rownames(maskedMatrix) | ||||||
|  |  | ||||||
| # inspect ... | # inspect ... | ||||||
| writeALN(APSESphyloSet) | writeALN(APSESphyloSet) | ||||||
|  |  | ||||||
| # As you see, we have removed a three residue insertion from MBP1_NEUCR, and | # As you see, we have removed a three residue insertion from MBP1_NEUCR, and | ||||||
| # several indels from the KILA_ESCCO outgroup sequence. | # several indels from the KILA_ESCCO outgroup sequence. | ||||||
|  |  | ||||||
|  |  | ||||||
| # We save the aligned, masked domains to a file in the data/ directory, | # We save the aligned, masked domains to a file in the data/ directory, | ||||||
| # in multi-FASTA format. | # in multi-FASTA format. | ||||||
| writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa") | writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa") | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,406 +1,406 @@ | |||||||
| # tocID <- "BIN-PHYLO-Tree_analysis.R" | # tocID <- "BIN-PHYLO-Tree_analysis.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-PHYLO-Tree_analysis unit. | #              R code accompanying the BIN-PHYLO-Tree_analysis unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 updates. Deprecate iTol and use taxize:: instead. | #           1.2    2020 updates. Deprecate iTol and use taxize:: instead. | ||||||
| #                  Rewrite of tip re-ordering. Better handling of | #                  Rewrite of tip re-ordering. Better handling of | ||||||
| #                  messages. pBar() for randomization. | #                  messages. pBar() for randomization. | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0.2  Typo in variable name, style changes | #           1.0.2  Typo in variable name, style changes | ||||||
| #           1.0.1  Wrong section heading | #           1.0.1  Wrong section heading | ||||||
| #           1.0    First 2017 version | #           1.0    First 2017 version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                              Line | #TOC>   Section  Title                              Line | ||||||
| #TOC> -------------------------------------------------- | #TOC> -------------------------------------------------- | ||||||
| #TOC>   1        Preparation and Tree Plot            50 | #TOC>   1        Preparation and Tree Plot            50 | ||||||
| #TOC>   2        SPECIES REFERENCE TREE               66 | #TOC>   2        SPECIES REFERENCE TREE               66 | ||||||
| #TOC>   3        Tree Analysis                       117 | #TOC>   3        Tree Analysis                       117 | ||||||
| #TOC>   3.1        Rooting Trees                     177 | #TOC>   3.1        Rooting Trees                     177 | ||||||
| #TOC>   3.2        Rotating Clades                   222 | #TOC>   3.2        Rotating Clades                   222 | ||||||
| #TOC>   3.3        Computing tree distances          309 | #TOC>   3.3        Computing tree distances          309 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Preparation and Tree Plot  =========================================== | # =    1  Preparation and Tree Plot  =========================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! requireNamespace("ape", quietly = TRUE)) { | if (! requireNamespace("ape", quietly = TRUE)) { | ||||||
|   install.packages("ape") |   install.packages("ape") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = ape)       # basic information | #  library(help = ape)       # basic information | ||||||
| #  browseVignettes("ape")    # available vignettes | #  browseVignettes("ape")    # available vignettes | ||||||
| #  data(package = "ape")     # available datasets | #  data(package = "ape")     # available datasets | ||||||
|  |  | ||||||
| # We change the graphics parameters from time to time, let's define the | # We change the graphics parameters from time to time, let's define the | ||||||
| # default so we can recreate a sane state: | # default so we can recreate a sane state: | ||||||
| dev.off() | dev.off() | ||||||
| PAR <- par() | PAR <- par() | ||||||
|  |  | ||||||
| # =    2  SPECIES REFERENCE TREE  ============================================== | # =    2  SPECIES REFERENCE TREE  ============================================== | ||||||
|  |  | ||||||
| # Before we do any kind of phylogenetic analysis of genes from several species, | # Before we do any kind of phylogenetic analysis of genes from several species, | ||||||
| # we MUST have a reference tree of the taxonomic relationships in hand. This | # we MUST have a reference tree of the taxonomic relationships in hand. This | ||||||
| # context is absolutely required for the interpretation of our tree. | # context is absolutely required for the interpretation of our tree. | ||||||
|  |  | ||||||
| # We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxa of interest. Here's how to use the taxize:: package. | # We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxa of interest. Here's how to use the taxize:: package. | ||||||
|  |  | ||||||
| if (! requireNamespace("taxize", quietly = TRUE)) { | if (! requireNamespace("taxize", quietly = TRUE)) { | ||||||
|   install.packages("taxize") |   install.packages("taxize") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help   = taxize)       # basic information | #  library(help   = taxize)       # basic information | ||||||
| #  browseVignettes("taxize")    # available vignettes | #  browseVignettes("taxize")    # available vignettes | ||||||
| #  data(package  = "taxize")     # available datasets | #  data(package  = "taxize")     # available datasets | ||||||
|  |  | ||||||
| ( mySOI <- c(myDB$taxonomy$ID, "83333") ) | ( mySOI <- c(myDB$taxonomy$ID, "83333") ) | ||||||
| myClass <- taxize::classification(mySOI, db = "ncbi") | myClass <- taxize::classification(mySOI, db = "ncbi") | ||||||
| str(myClass) | str(myClass) | ||||||
|  |  | ||||||
| myClass[[1]] | myClass[[1]] | ||||||
|  |  | ||||||
| fungiTree <- taxize::class2tree(myClass, check = TRUE) | fungiTree <- taxize::class2tree(myClass, check = TRUE) | ||||||
| plot(fungiTree) | plot(fungiTree) | ||||||
|  |  | ||||||
| # The tree produced by taxize:: contains full length species names, | # The tree produced by taxize:: contains full length species names, | ||||||
| # but it would be more convenient if it had bicodes instead. Also, the actual | # but it would be more convenient if it had bicodes instead. Also, the actual | ||||||
| # tree is only part of the list(), which will cause problems later: | # tree is only part of the list(), which will cause problems later: | ||||||
| str(fungiTree) | str(fungiTree) | ||||||
|  |  | ||||||
| # we therefore simplify | # we therefore simplify | ||||||
| fungiTree <- fungiTree$phylo | fungiTree <- fungiTree$phylo | ||||||
| str(fungiTree) | str(fungiTree) | ||||||
|  |  | ||||||
| # The species names are in a vector $tip.label of this list. | # The species names are in a vector $tip.label of this list. | ||||||
| # We can use biCode() to shorten them. | # We can use biCode() to shorten them. | ||||||
| fungiTree$tip.label <- biCode(fungiTree$tip.label) | fungiTree$tip.label <- biCode(fungiTree$tip.label) | ||||||
|  |  | ||||||
| # Plot the tree | # Plot the tree | ||||||
| nSP <- length(fungiTree$tip.label) | nSP <- length(fungiTree$tip.label) | ||||||
| plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE) | plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE) | ||||||
| text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4) | text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4) | ||||||
| ape::nodelabels(text = fungiTree$node.label, | ape::nodelabels(text = fungiTree$node.label, | ||||||
|                 cex = 0.6, |                 cex = 0.6, | ||||||
|                 adj = 0.2, |                 adj = 0.2, | ||||||
|                 bg = "#D4F2DA") |                 bg = "#D4F2DA") | ||||||
| # Note that you can use the arrow buttons in the menu above the plot pane to | # Note that you can use the arrow buttons in the menu above the plot pane to | ||||||
| # scroll back to plots you have created earlier - so you can reference back to | # scroll back to plots you have created earlier - so you can reference back to | ||||||
| # this species tree in your later analysis. | # this species tree in your later analysis. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Tree Analysis  ======================================================= | # =    3  Tree Analysis  ======================================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # 1.1  Visualizing your tree | # 1.1  Visualizing your tree | ||||||
| # The trees that are produced by Rphylip are stored as an object of class | # The trees that are produced by Rphylip are stored as an object of class | ||||||
| # "phylo". This is a class for phylogenetic trees that is widely used in the | # "phylo". This is a class for phylogenetic trees that is widely used in the | ||||||
| # community, practically all R phylogenetics packages have options to read and | # community, practically all R phylogenetics packages have options to read and | ||||||
| # manipulate such trees. Outside of R, a popular interchange format is the | # manipulate such trees. Outside of R, a popular interchange format is the | ||||||
| # Newick_format that you have seen above. It's easy to output your calculated | # Newick_format that you have seen above. It's easy to output your calculated | ||||||
| # trees in Newick format and visualize them elsewhere. | # trees in Newick format and visualize them elsewhere. | ||||||
|  |  | ||||||
| # The "phylo" class object is one of R's "S3" objects and methods to plot and | # The "phylo" class object is one of R's "S3" objects and methods to plot and | ||||||
| # print it have been defined with the Rphylip package, and in ape. You can | # print it have been defined with the Rphylip package, and in ape. You can | ||||||
| # simply call plot(<your-tree>) and R knows what to do with <your-tree> and how | # simply call plot(<your-tree>) and R knows what to do with <your-tree> and how | ||||||
| # to plot it. The underlying function is plot.phylo(), and documentation for its | # to plot it. The underlying function is plot.phylo(), and documentation for its | ||||||
| # many options can be found by typing: | # many options can be found by typing: | ||||||
|  |  | ||||||
| ?plot.phylo | ?plot.phylo | ||||||
|  |  | ||||||
| # We load the APSES sequence tree that you produced in the | # We load the APSES sequence tree that you produced in the | ||||||
| # BIN-PHYLO-Tree_building unit: | # BIN-PHYLO-Tree_building unit: | ||||||
| apsTree <- readRDS(file = "data/APSEStreeRproml.rds") | apsTree <- readRDS(file = "data/APSEStreeRproml.rds") | ||||||
|  |  | ||||||
| plot(apsTree) # default type is "phylogram" | plot(apsTree) # default type is "phylogram" | ||||||
| plot(apsTree, type = "unrooted") | plot(apsTree, type = "unrooted") | ||||||
| plot(apsTree, type = "fan", no.margin = TRUE) | plot(apsTree, type = "fan", no.margin = TRUE) | ||||||
|  |  | ||||||
| # rescale to show all of the labels: | # rescale to show all of the labels: | ||||||
| # record the current plot parameters by assigning them to a variable ... | # record the current plot parameters by assigning them to a variable ... | ||||||
| (tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE)) | (tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE)) | ||||||
| # ... and adjust the plot limits for a new plot: | # ... and adjust the plot limits for a new plot: | ||||||
| plot(apsTree, | plot(apsTree, | ||||||
|      type = "fan", |      type = "fan", | ||||||
|      x.lim = tmp$x.lim * 1.8, |      x.lim = tmp$x.lim * 1.8, | ||||||
|      y.lim = tmp$y.lim * 1.8, |      y.lim = tmp$y.lim * 1.8, | ||||||
|      cex = 0.8, |      cex = 0.8, | ||||||
|      no.margin = TRUE) |      no.margin = TRUE) | ||||||
|  |  | ||||||
| # Inspect the tree object | # Inspect the tree object | ||||||
| str(apsTree) | str(apsTree) | ||||||
| apsTree$tip.label | apsTree$tip.label | ||||||
| apsTree$edge | apsTree$edge | ||||||
| apsTree$edge.length | apsTree$edge.length | ||||||
|  |  | ||||||
| # show the node / edge and tip labels on a plot | # show the node / edge and tip labels on a plot | ||||||
| plot(apsTree) | plot(apsTree) | ||||||
| ape::nodelabels() | ape::nodelabels() | ||||||
| ape::edgelabels() | ape::edgelabels() | ||||||
| ape::tiplabels() | ape::tiplabels() | ||||||
|  |  | ||||||
| # show the number of nodes, edges and tips | # show the number of nodes, edges and tips | ||||||
| ape::Nnode(apsTree) | ape::Nnode(apsTree) | ||||||
| ape::Nedge(apsTree) | ape::Nedge(apsTree) | ||||||
| ape::Ntip(apsTree) | ape::Ntip(apsTree) | ||||||
|  |  | ||||||
| par(PAR)   # reset graphics state | par(PAR)   # reset graphics state | ||||||
|  |  | ||||||
| # Finally, write the tree to console in Newick format | # Finally, write the tree to console in Newick format | ||||||
| ape::write.tree(apsTree) | ape::write.tree(apsTree) | ||||||
|  |  | ||||||
| # ==   3.1  Rooting Trees  ===================================================== | # ==   3.1  Rooting Trees  ===================================================== | ||||||
|  |  | ||||||
| # In order to analyse the tree, it is helpful to root it first and reorder its | # In order to analyse the tree, it is helpful to root it first and reorder its | ||||||
| # clades. Contrary to documentation, Rproml() returns an unrooted tree. | # clades. Contrary to documentation, Rproml() returns an unrooted tree. | ||||||
|  |  | ||||||
| ape::is.rooted(apsTree) | ape::is.rooted(apsTree) | ||||||
|  |  | ||||||
| # You can root the tree with the command root() from the "ape" package. | # You can root the tree with the command root() from the "ape" package. | ||||||
|  |  | ||||||
| plot(apsTree) | plot(apsTree) | ||||||
|  |  | ||||||
| # add labels for internal nodes and tips | # add labels for internal nodes and tips | ||||||
| ape::nodelabels(cex = 0.5, frame = "circle") | ape::nodelabels(cex = 0.5, frame = "circle") | ||||||
| ape::tiplabels(cex = 0.5, frame = "rect") | ape::tiplabels(cex = 0.5, frame = "rect") | ||||||
|  |  | ||||||
| # The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different | # The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different | ||||||
| # number in yours. Substitute the correct node number below for "outgroup". | # number in yours. Substitute the correct node number below for "outgroup". | ||||||
| apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE) | apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE) | ||||||
| plot(apsTree) | plot(apsTree) | ||||||
| ape::is.rooted(apsTree) | ape::is.rooted(apsTree) | ||||||
|  |  | ||||||
| # This tree _looks_ unchanged, because when the root trifurcation was resolved, | # This tree _looks_ unchanged, because when the root trifurcation was resolved, | ||||||
| # an edge of length zero was added to connect the MRCA (Most Recent Common | # an edge of length zero was added to connect the MRCA (Most Recent Common | ||||||
| # Ancestor) of the ingroup. | # Ancestor) of the ingroup. | ||||||
|  |  | ||||||
| # The edge lengths are stored in the phylo object: | # The edge lengths are stored in the phylo object: | ||||||
| apsTree$edge.length | apsTree$edge.length | ||||||
|  |  | ||||||
| # ... and you can assign a small arbitrary value to the edge | # ... and you can assign a small arbitrary value to the edge | ||||||
| # to show how it connects to the tree without having an | # to show how it connects to the tree without having an | ||||||
| # overlap. | # overlap. | ||||||
| apsTree$edge.length[1] <- 0.1 | apsTree$edge.length[1] <- 0.1 | ||||||
| plot(apsTree, cex = 0.7) | plot(apsTree, cex = 0.7) | ||||||
| ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") | ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") | ||||||
|  |  | ||||||
|  |  | ||||||
| # This procedure does however not assign an actual length to a root edge, and | # This procedure does however not assign an actual length to a root edge, and | ||||||
| # therefore no root edge is visible on the plot. Why, you might ask? I ask | # therefore no root edge is visible on the plot. Why, you might ask? I ask | ||||||
| # myself that too. We'll just add a length by hand. | # myself that too. We'll just add a length by hand. | ||||||
|  |  | ||||||
| apsTree$root.edge <- mean(apsTree$edge.length) * 1.5 | apsTree$root.edge <- mean(apsTree$edge.length) * 1.5 | ||||||
| plot(apsTree, cex = 0.7, root.edge = TRUE) | plot(apsTree, cex = 0.7, root.edge = TRUE) | ||||||
| ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") | ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.2  Rotating Clades  =================================================== | # ==   3.2  Rotating Clades  =================================================== | ||||||
|  |  | ||||||
| # To interpret the tree, it is useful to rotate the clades so that they appear | # To interpret the tree, it is useful to rotate the clades so that they appear | ||||||
| # in the order expected from the cladogram of species. | # in the order expected from the cladogram of species. | ||||||
|  |  | ||||||
| # We can either rotate around individual internal nodes ... | # We can either rotate around individual internal nodes ... | ||||||
| layout(matrix(1:2, 1, 2)) | layout(matrix(1:2, 1, 2)) | ||||||
| plot(apsTree, no.margin = TRUE, root.edge = TRUE) | plot(apsTree, no.margin = TRUE, root.edge = TRUE) | ||||||
| ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866") | ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866") | ||||||
| plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE) | plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE) | ||||||
| ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66") | ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66") | ||||||
| # Note that the species at the bottom of the clade descending from node | # Note that the species at the bottom of the clade descending from node | ||||||
| # 17 is now plotted at the top. | # 17 is now plotted at the top. | ||||||
|  |  | ||||||
| par(PAR)   # reset graphics state | par(PAR)   # reset graphics state | ||||||
|  |  | ||||||
| # ... or we can rearrange the tree so it corresponds as well as possible to a | # ... or we can rearrange the tree so it corresponds as well as possible to a | ||||||
| # predefined tip ordering. Here we use the ordering that taxize:: has inferred | # predefined tip ordering. Here we use the ordering that taxize:: has inferred | ||||||
| # from the NCBI taxonomic classification. | # from the NCBI taxonomic classification. | ||||||
|  |  | ||||||
| nOrg <- length(apsTree$tip.label) | nOrg <- length(apsTree$tip.label) | ||||||
|  |  | ||||||
| plot(fungiTree, | plot(fungiTree, | ||||||
|      no.margin = FALSE, root.edge = TRUE) |      no.margin = FALSE, root.edge = TRUE) | ||||||
| ape::nodelabels(text = fungiTree$node.label, | ape::nodelabels(text = fungiTree$node.label, | ||||||
|                 cex = 0.5, |                 cex = 0.5, | ||||||
|                 adj = 0.2, |                 adj = 0.2, | ||||||
|                 bg = "#D4F2DA") |                 bg = "#D4F2DA") | ||||||
|  |  | ||||||
| # These are the fungi tree tips ... | # These are the fungi tree tips ... | ||||||
| fungiTree$tip.label | fungiTree$tip.label | ||||||
| # ... and their order is determined by the edge-list that is stored in | # ... and their order is determined by the edge-list that is stored in | ||||||
| fungiTree$edge | fungiTree$edge | ||||||
| # which edges join the tips? | # which edges join the tips? | ||||||
| ape::tiplabels(cex = 0.5, frame = "rect") | ape::tiplabels(cex = 0.5, frame = "rect") | ||||||
| # as you can see, the tips (range [1:nOrg] ) are in column 2 and they are | # as you can see, the tips (range [1:nOrg] ) are in column 2 and they are | ||||||
| # ordered from bottom to top. | # ordered from bottom to top. | ||||||
| # And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ... | # And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ... | ||||||
|  |  | ||||||
| sel <- fungiTree$edge[ , 2 ] <= nOrg | sel <- fungiTree$edge[ , 2 ] <= nOrg | ||||||
| ( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] ) | ( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] ) | ||||||
|  |  | ||||||
| # Now, here are the genes of the apsTree tips ... | # Now, here are the genes of the apsTree tips ... | ||||||
| apsTree$tip.label | apsTree$tip.label | ||||||
|  |  | ||||||
| # ... and the "constraint"  we need for reordering, according to the help page | # ... and the "constraint"  we need for reordering, according to the help page | ||||||
| # of ape::rotateConstr(), is "a vector specifying the order of the tips as they | # of ape::rotateConstr(), is "a vector specifying the order of the tips as they | ||||||
| # should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector | # should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector | ||||||
| oSp <- gsub("^", "MBP1_", oSp) | oSp <- gsub("^", "MBP1_", oSp) | ||||||
| ( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) ) | ( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) ) | ||||||
|  |  | ||||||
| # Then we can plot the two trees to compare: the fungi tree | # Then we can plot the two trees to compare: the fungi tree | ||||||
| par(PAR)   # reset graphics state | par(PAR)   # reset graphics state | ||||||
| layout(matrix(1:2, 1, 2)) | layout(matrix(1:2, 1, 2)) | ||||||
| plot(fungiTree, | plot(fungiTree, | ||||||
|     no.margin = TRUE, |     no.margin = TRUE, | ||||||
|      root.edge = TRUE) |      root.edge = TRUE) | ||||||
| ape::nodelabels(text = fungiTree$node.label, | ape::nodelabels(text = fungiTree$node.label, | ||||||
|                 cex = 0.5, |                 cex = 0.5, | ||||||
|                 adj = 0.2, |                 adj = 0.2, | ||||||
|                 bg = "#D4F2DA") |                 bg = "#D4F2DA") | ||||||
|  |  | ||||||
| # and the re-organized apsTree ... | # and the re-organized apsTree ... | ||||||
| plot(ape::rotateConstr(apsTree, constraint = oSp[]), | plot(ape::rotateConstr(apsTree, constraint = oSp[]), | ||||||
|      no.margin = TRUE, |      no.margin = TRUE, | ||||||
|      root.edge = TRUE) |      root.edge = TRUE) | ||||||
|  |  | ||||||
| par(PAR)   # reset graphics state | par(PAR)   # reset graphics state | ||||||
|  |  | ||||||
| # As you can see, the reordering is not perfect, since the topologies are | # As you can see, the reordering is not perfect, since the topologies are | ||||||
| # different, mostly due to the unresolved nodes in the reference tree. One | # different, mostly due to the unresolved nodes in the reference tree. One | ||||||
| # could play with that ... | # could play with that ... | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: Study the two trees and consider their similarities and differences. | # Task: Study the two trees and consider their similarities and differences. | ||||||
| #         What do you expect? What do you find? Note that this is not a "mixed" | #         What do you expect? What do you find? Note that this is not a "mixed" | ||||||
| #         gene tree yet, since it contains only a single gene for the species | #         gene tree yet, since it contains only a single gene for the species | ||||||
| #         we considered. All of the branch points in this tree are speciation | #         we considered. All of the branch points in this tree are speciation | ||||||
| #         events. Thus the gene tree should have the same topology as the | #         events. Thus the gene tree should have the same topology as the | ||||||
| #         species tree. Does it? Are the differences important? How many | #         species tree. Does it? Are the differences important? How many | ||||||
| #         branches would you need to remove and reinsert elsewhere to get the | #         branches would you need to remove and reinsert elsewhere to get the | ||||||
| #         same topology as the species tree? | #         same topology as the species tree? | ||||||
|  |  | ||||||
| # In order to quantify how different these two trees are, we need to compute | # In order to quantify how different these two trees are, we need to compute | ||||||
| # tree distances. | # tree distances. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.3  Computing tree distances  ========================================== | # ==   3.3  Computing tree distances  ========================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Many superb phylogeny tools are contributed by the phangorn package. | # Many superb phylogeny tools are contributed by the phangorn package. | ||||||
|  |  | ||||||
| if (! requireNamespace("phangorn", quietly = TRUE)) { | if (! requireNamespace("phangorn", quietly = TRUE)) { | ||||||
|   install.packages("phangorn") |   install.packages("phangorn") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = phangorn)       # basic information | #  library(help = phangorn)       # basic information | ||||||
| #  browseVignettes("phangorn")    # available vignettes | #  browseVignettes("phangorn")    # available vignettes | ||||||
| #  data(package = "phangorn")     # available datasets | #  data(package = "phangorn")     # available datasets | ||||||
|  |  | ||||||
| # To compare two trees, they must have the same tip labels. We delete "MBP1_" or | # To compare two trees, they must have the same tip labels. We delete "MBP1_" or | ||||||
| # "KILA_" from the existing tip labels in a copy of our APSES domain tree. | # "KILA_" from the existing tip labels in a copy of our APSES domain tree. | ||||||
| apsTree2 <- apsTree | apsTree2 <- apsTree | ||||||
| apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) | apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) | ||||||
|  |  | ||||||
|  |  | ||||||
| # phangorn provides several functions to compute tree-differences (and there | # phangorn provides several functions to compute tree-differences (and there | ||||||
| # is a _whole_ lot of theory on how to compare trees). treedist() returns the | # is a _whole_ lot of theory on how to compare trees). treedist() returns the | ||||||
| # "symmetric difference" | # "symmetric difference" | ||||||
| phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE) | phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE) | ||||||
|  |  | ||||||
| # Numbers. What do they mean? How much more similar is our apsTree to the | # Numbers. What do they mean? How much more similar is our apsTree to the | ||||||
| # (presumably) ground truth of fungiTree than a random tree would be? | # (presumably) ground truth of fungiTree than a random tree would be? | ||||||
| # The ape package provides the function rtree() | # The ape package provides the function rtree() | ||||||
| # to compute random trees. | # to compute random trees. | ||||||
|  |  | ||||||
| ape::rtree(n = length(apsTree2$tip.label), # number of tips | ape::rtree(n = length(apsTree2$tip.label), # number of tips | ||||||
|           rooted = TRUE,                   # we rooted the tree above, |           rooted = TRUE,                   # we rooted the tree above, | ||||||
|                                            #  and fungiTree is rooted anyway |                                            #  and fungiTree is rooted anyway | ||||||
|           tip.label = apsTree2$tip.label,  # use the apsTree2 labels |           tip.label = apsTree2$tip.label,  # use the apsTree2 labels | ||||||
|           br = NULL)                       # don't generate branch lengths since |           br = NULL)                       # don't generate branch lengths since | ||||||
|                                            #   fungiTree has none, so we can't |                                            #   fungiTree has none, so we can't | ||||||
|                                            #   compare them anyway. |                                            #   compare them anyway. | ||||||
|  |  | ||||||
| # (Note the warning message about non-binary trees; we'll suppress that later | # (Note the warning message about non-binary trees; we'll suppress that later | ||||||
| #  by wrapping the function call in suppressMessages(); we don't want to | #  by wrapping the function call in suppressMessages(); we don't want to | ||||||
| #  print it 10,000 times :-) | #  print it 10,000 times :-) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Let's compute some random trees this way, calculate the distances to | # Let's compute some random trees this way, calculate the distances to | ||||||
| # fungiTree, and then compare the values we get for apsTree2. The random | # fungiTree, and then compare the values we get for apsTree2. The random | ||||||
| # trees are provided by ape::rtree(). | # trees are provided by ape::rtree(). | ||||||
|  |  | ||||||
| N <- 10000  # takes about 15 seconds, and we'll use the pBar function, | N <- 10000  # takes about 15 seconds, and we'll use the pBar function, | ||||||
|             # defined in .utilities.R  to keep track of where we are at: |             # defined in .utilities.R  to keep track of where we are at: | ||||||
| myTreeDistances <- matrix(numeric(N * 2), ncol = 2) | myTreeDistances <- matrix(numeric(N * 2), ncol = 2) | ||||||
| colnames(myTreeDistances) <- c("symm", "path") | colnames(myTreeDistances) <- c("symm", "path") | ||||||
|  |  | ||||||
| set.seed(112358) | set.seed(112358) | ||||||
| for (i in 1:N) { | for (i in 1:N) { | ||||||
|   pBar(i, N) |   pBar(i, N) | ||||||
|   xTree <- ape::rtree(n = length(apsTree2$tip.label), |   xTree <- ape::rtree(n = length(apsTree2$tip.label), | ||||||
|                       rooted = TRUE, |                       rooted = TRUE, | ||||||
|                       tip.label = apsTree2$tip.label, |                       tip.label = apsTree2$tip.label, | ||||||
|                       br = NULL) |                       br = NULL) | ||||||
|   myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree)) |   myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree)) | ||||||
| } | } | ||||||
| set.seed(NULL)                      # reset the random number generator | set.seed(NULL)                      # reset the random number generator | ||||||
|  |  | ||||||
| table(myTreeDistances[, "symm"]) | table(myTreeDistances[, "symm"]) | ||||||
|  |  | ||||||
| ( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] ) | ( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] ) | ||||||
|  |  | ||||||
| # Random events less-or-equal to observation, divided by total number of | # Random events less-or-equal to observation, divided by total number of | ||||||
| # events gives us the empirical p-value. | # events gives us the empirical p-value. | ||||||
| cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n", | cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n", | ||||||
|             (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1))) |             (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1))) | ||||||
|  |  | ||||||
| par(PAR)   # reset graphics state | par(PAR)   # reset graphics state | ||||||
| hist(myTreeDistances[, "path"], | hist(myTreeDistances[, "path"], | ||||||
|      col = "aliceblue", |      col = "aliceblue", | ||||||
|      main = "Distances of random Trees to fungiTree") |      main = "Distances of random Trees to fungiTree") | ||||||
| (pathObs <- phangorn::treedist(fungiTree, apsTree2)[2]) | (pathObs <- phangorn::treedist(fungiTree, apsTree2)[2]) | ||||||
| abline(v = pathObs, col = "chartreuse") | abline(v = pathObs, col = "chartreuse") | ||||||
|  |  | ||||||
| # Random events less-or-equal to observation, divided by total number of | # Random events less-or-equal to observation, divided by total number of | ||||||
| # events gives us the empirical p-value. | # events gives us the empirical p-value. | ||||||
| cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n", | cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n", | ||||||
|             (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1))) |             (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1))) | ||||||
|  |  | ||||||
| # Indeed, our apsTree is _very_ much more similar to the species tree than | # Indeed, our apsTree is _very_ much more similar to the species tree than | ||||||
| # we would expect by random chance. | # we would expect by random chance. | ||||||
|  |  | ||||||
| # What do we gain from that analysis? Analyzing the tree we get from a single | # What do we gain from that analysis? Analyzing the tree we get from a single | ||||||
| # gene of orthologous sequences is a positive control in our computational | # gene of orthologous sequences is a positive control in our computational | ||||||
| # experiment. If these genes are indeed orthologues, a correct tree-building | # experiment. If these genes are indeed orthologues, a correct tree-building | ||||||
| # program ought to give us a tree that exactly matches the species tree. | # program ought to give us a tree that exactly matches the species tree. | ||||||
| # Evaluating how far off we are from the known correct result gives us a way to | # Evaluating how far off we are from the known correct result gives us a way to | ||||||
| # validate our workflow and our algorithm. If we can't get that right, we can't | # validate our workflow and our algorithm. If we can't get that right, we can't | ||||||
| # expect to get "real" data right either. Employing such positive controls in | # expect to get "real" data right either. Employing such positive controls in | ||||||
| # every computational experiment is essential for research. Not doing so is | # every computational experiment is essential for research. Not doing so is | ||||||
| # Cargo Cult Bioinformatics. | # Cargo Cult Bioinformatics. | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,168 +1,168 @@ | |||||||
| # tocID <- "BIN-PHYLO-Tree_building.R" | # tocID <- "BIN-PHYLO-Tree_building.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-PHYLO-Tree_building unit. | #              R code accompanying the BIN-PHYLO-Tree_building unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10   2020-09 | # Date:     2017-10   2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac: | #           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac: | ||||||
| #                  instructions to authorize proml.app | #                  instructions to authorize proml.app | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #           1.0    First 2017 version | #           1.0    First 2017 version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| #           Add MrBayes | #           Add MrBayes | ||||||
| # https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html | # https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                       Line | #TOC>   Section  Title                                       Line | ||||||
| #TOC> ----------------------------------------------------------- | #TOC> ----------------------------------------------------------- | ||||||
| #TOC>   1        Calculating Trees                             48 | #TOC>   1        Calculating Trees                             48 | ||||||
| #TOC>   1.1        PROMLPATH ...                               68 | #TOC>   1.1        PROMLPATH ...                               68 | ||||||
| #TOC>   1.1.1          ... on the Mac                          73 | #TOC>   1.1.1          ... on the Mac                          73 | ||||||
| #TOC>   1.1.2          ... on Windows                         101 | #TOC>   1.1.2          ... on Windows                         101 | ||||||
| #TOC>   1.1.3          ... on Linux                           115 | #TOC>   1.1.3          ... on Linux                           115 | ||||||
| #TOC>   1.1.4          Confirming PROMLPATH                   120 | #TOC>   1.1.4          Confirming PROMLPATH                   120 | ||||||
| #TOC>   1.2        Building a maximum likelihood tree         134 | #TOC>   1.2        Building a maximum likelihood tree         134 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Calculating Trees  =================================================== | # =    1  Calculating Trees  =================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Follow the instructions found at phylip's home on the Web to install. If you | # Follow the instructions found at phylip's home on the Web to install. If you | ||||||
| # are on a Windows computer, take note of the installation directory. | # are on a Windows computer, take note of the installation directory. | ||||||
|  |  | ||||||
| # After you have installed Phylip on your computer, install the R package that | # After you have installed Phylip on your computer, install the R package that | ||||||
| # provides an interface to the Phylip functions. | # provides an interface to the Phylip functions. | ||||||
|  |  | ||||||
| if (! requireNamespace("Rphylip", quietly = TRUE)) { | if (! requireNamespace("Rphylip", quietly = TRUE)) { | ||||||
|   install.packages("Rphylip") |   install.packages("Rphylip") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Rphylip)       # basic information | #  library(help = Rphylip)       # basic information | ||||||
| #  browseVignettes("Rphylip")    # available vignettes | #  browseVignettes("Rphylip")    # available vignettes | ||||||
| #  data(package = "Rphylip")     # available datasets | #  data(package = "Rphylip")     # available datasets | ||||||
|  |  | ||||||
| # This will install Rphylip, as well as its dependency, the package "ape". | # This will install Rphylip, as well as its dependency, the package "ape". | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.1  PROMLPATH ...  ===================================================== | # ==   1.1  PROMLPATH ...  ===================================================== | ||||||
| # The next part may be tricky. You will need to figure out where | # The next part may be tricky. You will need to figure out where | ||||||
| # on your computer Phylip has been installed and define the path | # on your computer Phylip has been installed and define the path | ||||||
| # to the proml program that calculates a maximum-likelihood tree. | # to the proml program that calculates a maximum-likelihood tree. | ||||||
|  |  | ||||||
| # ===   1.1.1  ... on the Mac                     | # ===   1.1.1  ... on the Mac                     | ||||||
| # On the Mac, the standard installation places a phylip folder | # On the Mac, the standard installation places a phylip folder | ||||||
| # in the /Applications directory. That folder contains all the | # in the /Applications directory. That folder contains all the | ||||||
| # individual phylip programs as <name>.app files. These are not | # individual phylip programs as <name>.app files. These are not | ||||||
| # the actual executables, but "app" files are actually directories | # the actual executables, but "app" files are actually directories | ||||||
| # that contain the required resources for a program to run. | # that contain the required resources for a program to run. | ||||||
|  |  | ||||||
| # The executable is in a subdirectory and you can point Rphylip | # The executable is in a subdirectory and you can point Rphylip | ||||||
| # directly to that subdirectory to find the program it needs: | # directly to that subdirectory to find the program it needs: | ||||||
| # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS" | # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS" | ||||||
|  |  | ||||||
| # However, RPHYLIP will not be able to run PHYLIP applications immediately, | # However, RPHYLIP will not be able to run PHYLIP applications immediately, | ||||||
| # because they have not been "signed" by the PHYLIP developers. The process | # because they have not been "signed" by the PHYLIP developers. The process | ||||||
| # will be terminated by your system, with a warning. | # will be terminated by your system, with a warning. | ||||||
|  |  | ||||||
| #   -  Navigate to the phylip folder in your /Applications directory | #   -  Navigate to the phylip folder in your /Applications directory | ||||||
| #   -  Descend into the "exe" folder and find  proml.app | #   -  Descend into the "exe" folder and find  proml.app | ||||||
| #   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that | #   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that | ||||||
| #      says: "macOS cannot verify the developer of “proml.app”. | #      says: "macOS cannot verify the developer of “proml.app”. | ||||||
| #             Are you sure you want to open it?" | #             Are you sure you want to open it?" | ||||||
| #   -  Click open to continue. You may need to allow access to the terminal | #   -  Click open to continue. You may need to allow access to the terminal | ||||||
| #      as well. When the proml terminal session opens, you can type | #      as well. When the proml terminal session opens, you can type | ||||||
| #      Ctrl-c to abort the program and close the window. | #      Ctrl-c to abort the program and close the window. | ||||||
| # | # | ||||||
| #   This adds proml.app to the list of known-good programs and you will not | #   This adds proml.app to the list of known-good programs and you will not | ||||||
| #   need to repeat this process. | #   need to repeat this process. | ||||||
| # | # | ||||||
|  |  | ||||||
| # ===   1.1.2  ... on Windows                     | # ===   1.1.2  ... on Windows                     | ||||||
| # On Windows you need to know where the programs have been installed, and you | # On Windows you need to know where the programs have been installed, and you | ||||||
| # need to specify a path that is correct for the Windows OS. Find the folder | # need to specify a path that is correct for the Windows OS. Find the folder | ||||||
| # that is named "exe", and right-click to inspect its properties. The path | # that is named "exe", and right-click to inspect its properties. The path | ||||||
| # should be listed among them. | # should be listed among them. | ||||||
|  |  | ||||||
| # If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your | # If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your | ||||||
| # assignment has to be | # assignment has to be | ||||||
| # PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe" | # PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe" | ||||||
| # (Note: "/", not "\") | # (Note: "/", not "\") | ||||||
|  |  | ||||||
| # I have heard that your path must not contain spaces, and it is prudent to | # I have heard that your path must not contain spaces, and it is prudent to | ||||||
| # avoid other special characters as well. | # avoid other special characters as well. | ||||||
|  |  | ||||||
| # ===   1.1.3  ... on Linux                       | # ===   1.1.3  ... on Linux                       | ||||||
| # If you are running Linux I trust you know what to do. It's probably | # If you are running Linux I trust you know what to do. It's probably | ||||||
| # something like | # something like | ||||||
| # PROMLPATH <- "/usr/local/phylip-3.695/bin" | # PROMLPATH <- "/usr/local/phylip-3.695/bin" | ||||||
|  |  | ||||||
| # ===   1.1.4  Confirming PROMLPATH               | # ===   1.1.4  Confirming PROMLPATH               | ||||||
| # Confirm that the settings are right. | # Confirm that the settings are right. | ||||||
| PROMLPATH                # returns the path | PROMLPATH                # returns the path | ||||||
| list.dirs(PROMLPATH)     # returns the directories in that path | list.dirs(PROMLPATH)     # returns the directories in that path | ||||||
| list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command" | list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command" | ||||||
|  |  | ||||||
| # If "proml" is NOT among the files that the last command returns, you | # If "proml" is NOT among the files that the last command returns, you | ||||||
| # can't continue. Ask on the mailing list for advice. | # can't continue. Ask on the mailing list for advice. | ||||||
|  |  | ||||||
| # If everything is good, you can add the line that defines PROMLPATH to | # If everything is good, you can add the line that defines PROMLPATH to | ||||||
| # myScripts/.myProfile.R - the path will then be automatically set when | # myScripts/.myProfile.R - the path will then be automatically set when | ||||||
| # you quit RStudio and return. | # you quit RStudio and return. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.2  Building a maximum likelihood tree  ================================ | # ==   1.2  Building a maximum likelihood tree  ================================ | ||||||
| # Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit, | # Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit, | ||||||
| # as a "proseq" object with the read.protein() function of the Rphylip package: | # as a "proseq" object with the read.protein() function of the Rphylip package: | ||||||
|  |  | ||||||
| apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa") | apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa") | ||||||
| str(apsIn) | str(apsIn) | ||||||
|  |  | ||||||
| # ... and you are ready to build a tree. | # ... and you are ready to build a tree. | ||||||
|  |  | ||||||
| # There are many fast options in PHYLIP - we will use the most _accurate_ one | # There are many fast options in PHYLIP - we will use the most _accurate_ one | ||||||
| # that it has: proml, a maximum-likelihood tree building program for protein | # that it has: proml, a maximum-likelihood tree building program for protein | ||||||
| # data. | # data. | ||||||
|  |  | ||||||
| # Building maximum-likelihood trees can eat as much computer time | # Building maximum-likelihood trees can eat as much computer time | ||||||
| # as you can throw at it. Calculating a tree of 48 APSES domains | # as you can throw at it. Calculating a tree of 48 APSES domains | ||||||
| # with default parameters of Rproml() runs for more than half a day | # with default parameters of Rproml() runs for more than half a day | ||||||
| # on my computer. But we have only twelve sequences here, so the | # on my computer. But we have only twelve sequences here, so the | ||||||
| # process will take us about 5 to 15 minutes. Run this, and enjoy a good cup | # process will take us about 5 to 15 minutes. Run this, and enjoy a good cup | ||||||
| # of coffee while you are waiting. | # of coffee while you are waiting. | ||||||
|  |  | ||||||
| apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH) | apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH) | ||||||
|  |  | ||||||
| # A quick first look: | # A quick first look: | ||||||
|  |  | ||||||
| plot(apsTree) | plot(apsTree) | ||||||
|  |  | ||||||
| # save your tree: | # save your tree: | ||||||
| saveRDS(apsTree, file = "data/APSEStreeRproml.rds") | saveRDS(apsTree, file = "data/APSEStreeRproml.rds") | ||||||
|  |  | ||||||
| # If this did not work, ask for advice. | # If this did not work, ask for advice. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,323 +1,323 @@ | |||||||
| # tocID <- "BIN-PPI-Analysis.R" | # tocID <- "BIN-PPI-Analysis.R" | ||||||
| # | # | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-PPI-Analysis unit. | #              R code accompanying the BIN-PPI-Analysis unit. | ||||||
| # | # | ||||||
| # Version:   1.4 | # Version:   1.4 | ||||||
| # | # | ||||||
| # Date:     2017-08  -  2020-10 | # Date:     2017-08  -  2020-10 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.4    Update vector ID's for betweenness centrality. | #           1.4    Update vector ID's for betweenness centrality. | ||||||
| #           1.3    Bugfix: called the wrong function on ENSPsel in l. 220 | #           1.3    Bugfix: called the wrong function on ENSPsel in l. 220 | ||||||
| #           1.2    2020 Updates; Rewrite for new STRING V11; | #           1.2    2020 Updates; Rewrite for new STRING V11; | ||||||
| #                  Deprecate save()/load() for saveRDS()/readRDS() | #                  Deprecate save()/load() for saveRDS()/readRDS() | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0    First live version | #           1.0    First live version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                                           Line | #TOC>   Section  Title                                           Line | ||||||
| #TOC> --------------------------------------------------------------- | #TOC> --------------------------------------------------------------- | ||||||
| #TOC>   1        Setup and data                                    50 | #TOC>   1        Setup and data                                    50 | ||||||
| #TOC>   2        Functional Edges in the Human Proteome            86 | #TOC>   2        Functional Edges in the Human Proteome            86 | ||||||
| #TOC>   2.1        Cliques                                        129 | #TOC>   2.1        Cliques                                        129 | ||||||
| #TOC>   2.2        Communities                                    170 | #TOC>   2.2        Communities                                    170 | ||||||
| #TOC>   2.3        Betweenness Centrality                         184 | #TOC>   2.3        Betweenness Centrality                         184 | ||||||
| #TOC>   3        biomaRt                                          231 | #TOC>   3        biomaRt                                          231 | ||||||
| #TOC>   4        Task for submission                              302 | #TOC>   4        Task for submission                              302 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Setup and data  ====================================================== | # =    1  Setup and data  ====================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Not surprisingly, the analysis of PPI networks needs iGraph: | # Not surprisingly, the analysis of PPI networks needs iGraph: | ||||||
|  |  | ||||||
| if (! requireNamespace("igraph", quietly = TRUE)) { | if (! requireNamespace("igraph", quietly = TRUE)) { | ||||||
|   install.packages("igraph") |   install.packages("igraph") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = igraph)       # basic information | #  library(help = igraph)       # basic information | ||||||
| #  browseVignettes("igraph")    # available vignettes | #  browseVignettes("igraph")    # available vignettes | ||||||
| #  data(package = "igraph")     # available datasets | #  data(package = "igraph")     # available datasets | ||||||
|  |  | ||||||
| # In order for you to explore some real, biological networks, I give you a | # In order for you to explore some real, biological networks, I give you a | ||||||
| # dataframe of functional relationships of human proteins that I have downloaded | # dataframe of functional relationships of human proteins that I have downloaded | ||||||
| # from the STRING database. The full table has 8.5 million records, here is a | # from the STRING database. The full table has 8.5 million records, here is a | ||||||
| # subset of records with combined confidence scores > 980 | # subset of records with combined confidence scores > 980 | ||||||
|  |  | ||||||
| # The selected set of edges with a confidence of > 964 is a dataframe with about | # The selected set of edges with a confidence of > 964 is a dataframe with about | ||||||
| # 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of | # 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of | ||||||
| # a fungal proteome. You can load the saved dataframe here (To read more about | # a fungal proteome. You can load the saved dataframe here (To read more about | ||||||
| # what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ). | # what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ). | ||||||
|  |  | ||||||
| STRINGedges <- readRDS("./data/STRINGedges.rds") | STRINGedges <- readRDS("./data/STRINGedges.rds") | ||||||
|  |  | ||||||
| head(STRINGedges) | head(STRINGedges) | ||||||
|  |  | ||||||
| # Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the | # Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the | ||||||
| # Ensembl protein identifiers that start with ENSP. We'll remove them: | # Ensembl protein identifiers that start with ENSP. We'll remove them: | ||||||
|  |  | ||||||
| STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a) | STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a) | ||||||
| STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b) | STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b) | ||||||
|  |  | ||||||
| head(STRINGedges) | head(STRINGedges) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Functional Edges in the Human Proteome  ============================== | # =    2  Functional Edges in the Human Proteome  ============================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # There are many possibilities to explore interesting aspects of biological | # There are many possibilities to explore interesting aspects of biological | ||||||
| # networks, we will keep with some very simple procedures here but you have | # networks, we will keep with some very simple procedures here but you have | ||||||
| # to be aware that this is barely scratching the surface of possibilities. | # to be aware that this is barely scratching the surface of possibilities. | ||||||
| # However, once the network exists in your computer, it is comparatively | # However, once the network exists in your computer, it is comparatively | ||||||
| # easy to find information online about the many, many options to analyze. | # easy to find information online about the many, many options to analyze. | ||||||
|  |  | ||||||
|  |  | ||||||
| # Make a graph from this dataframe | # Make a graph from this dataframe | ||||||
| ?igraph::graph_from_data_frame | ?igraph::graph_from_data_frame | ||||||
|  |  | ||||||
| gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE) | gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE) | ||||||
|  |  | ||||||
| # CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges - | # CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges - | ||||||
| # layout of such large graphs is possible, but requires specialized code. Google | # layout of such large graphs is possible, but requires specialized code. Google | ||||||
| # for <layout large graphs> if you are curious. Also, consider what one can | # for <layout large graphs> if you are curious. Also, consider what one can | ||||||
| # really learn from plotting such a graph ... | # really learn from plotting such a graph ... | ||||||
|  |  | ||||||
| # Of course simple computations on this graph are reasonably fast: | # Of course simple computations on this graph are reasonably fast: | ||||||
|  |  | ||||||
| compSTR <- igraph::components(gSTR) | compSTR <- igraph::components(gSTR) | ||||||
| summary(compSTR) # our graph is fully connected! | summary(compSTR) # our graph is fully connected! | ||||||
|  |  | ||||||
| hist(log(igraph::degree(gSTR)), col="#FEE0AF") | hist(log(igraph::degree(gSTR)), col="#FEE0AF") | ||||||
| # this actually does look rather scale-free | # this actually does look rather scale-free | ||||||
|  |  | ||||||
| (freqRank <- table(igraph::degree(gSTR))) | (freqRank <- table(igraph::degree(gSTR))) | ||||||
| plot(log10(as.numeric(names(freqRank)) + 1), | plot(log10(as.numeric(names(freqRank)) + 1), | ||||||
|      log10(as.numeric(freqRank)), type = "b", |      log10(as.numeric(freqRank)), type = "b", | ||||||
|      pch = 21, bg = "#FEE0AF", |      pch = 21, bg = "#FEE0AF", | ||||||
|      xlab = "log(Rank)", ylab = "log(frequency)", |      xlab = "log(Rank)", ylab = "log(frequency)", | ||||||
|      main = "8,400 nodes from the human functional interaction network") |      main = "8,400 nodes from the human functional interaction network") | ||||||
|  |  | ||||||
| # This looks very scale-free indeed. | # This looks very scale-free indeed. | ||||||
|  |  | ||||||
| (regressionLine <- lm(log10(as.numeric(freqRank)) ~ | (regressionLine <- lm(log10(as.numeric(freqRank)) ~ | ||||||
|                       log10(as.numeric(names(freqRank)) + 1))) |                       log10(as.numeric(names(freqRank)) + 1))) | ||||||
| abline(regressionLine, col = "firebrick") | abline(regressionLine, col = "firebrick") | ||||||
|  |  | ||||||
| # Now explore some more: | # Now explore some more: | ||||||
|  |  | ||||||
| # ==   2.1  Cliques  =========================================================== | # ==   2.1  Cliques  =========================================================== | ||||||
|  |  | ||||||
| # Let's find the largest cliques. Remember: a clique is a fully connected | # Let's find the largest cliques. Remember: a clique is a fully connected | ||||||
| # subgraph, i.e. a subgraph in which every node is connected to every other. | # subgraph, i.e. a subgraph in which every node is connected to every other. | ||||||
| # Biological complexes often appear as cliques in interaction graphs. | # Biological complexes often appear as cliques in interaction graphs. | ||||||
|  |  | ||||||
| igraph::clique_num(gSTR) | igraph::clique_num(gSTR) | ||||||
| # The largest clique has 81 members. | # The largest clique has 81 members. | ||||||
|  |  | ||||||
| (C <- igraph::largest_cliques(gSTR)[[1]]) | (C <- igraph::largest_cliques(gSTR)[[1]]) | ||||||
|  |  | ||||||
| # Pick one of the proteins and find out what this fully connected cluster of 81 | # Pick one of the proteins and find out what this fully connected cluster of 81 | ||||||
| # proteins is (you can simply Google for any of the IDs). Is this expected? | # proteins is (you can simply Google for any of the IDs). Is this expected? | ||||||
|  |  | ||||||
| # Plot this ... | # Plot this ... | ||||||
| R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices | R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices | ||||||
|  |  | ||||||
| # color the vertices along a color spectrum | # color the vertices along a color spectrum | ||||||
| vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes | vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes | ||||||
|  |  | ||||||
| # color the edges to have the same color as the originating node | # color the edges to have the same color as the originating node | ||||||
| eCol <- character() | eCol <- character() | ||||||
| for (i in seq_along(vCol)) { | for (i in seq_along(vCol)) { | ||||||
|   eCol <- c(eCol, rep(vCol[i], igraph::gorder(R))) |   eCol <- c(eCol, rep(vCol[i], igraph::gorder(R))) | ||||||
| } | } | ||||||
|  |  | ||||||
| oPar <- par(mar= rep(0,4)) # Turn margins off | oPar <- par(mar= rep(0,4)) # Turn margins off | ||||||
| plot(R, | plot(R, | ||||||
|      layout = igraph::layout_in_circle(R), |      layout = igraph::layout_in_circle(R), | ||||||
|      vertex.size = 3, |      vertex.size = 3, | ||||||
|      vertex.color = vCol, |      vertex.color = vCol, | ||||||
|      edge.color = eCol, |      edge.color = eCol, | ||||||
|      edge.width = 0.1, |      edge.width = 0.1, | ||||||
|      vertex.label = NA) |      vertex.label = NA) | ||||||
| par(oPar) | par(oPar) | ||||||
|  |  | ||||||
| # ... well: remember: a clique means every node is connected to every other | # ... well: remember: a clique means every node is connected to every other | ||||||
| # node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI | # node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI | ||||||
| # networks looks like for large complexes. | # networks looks like for large complexes. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   2.2  Communities  ======================================================= | # ==   2.2  Communities  ======================================================= | ||||||
|  |  | ||||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | set.seed(112358)                       # set RNG seed for repeatable randomness | ||||||
| gSTRclusters <- igraph::cluster_infomap(gSTR) | gSTRclusters <- igraph::cluster_infomap(gSTR) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| igraph::modularity(gSTRclusters) # ... measures how separated the different | igraph::modularity(gSTRclusters) # ... measures how separated the different | ||||||
|                                  # membership types are from each other |                                  # membership types are from each other | ||||||
| tMem <- table(igraph::membership(gSTRclusters)) | tMem <- table(igraph::membership(gSTRclusters)) | ||||||
| length(tMem)  # About 700 communities identified | length(tMem)  # About 700 communities identified | ||||||
| hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ... | hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ... | ||||||
| range(tMem) # ... but one has > 200 members | range(tMem) # ... but one has > 200 members | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   2.3  Betweenness Centrality  ============================================ | # ==   2.3  Betweenness Centrality  ============================================ | ||||||
|  |  | ||||||
| # Let's find the ten nodes with the highest betweenness centralities. | # Let's find the ten nodes with the highest betweenness centralities. | ||||||
| # | # | ||||||
| BC <- igraph::centr_betw(gSTR) | BC <- igraph::centr_betw(gSTR) | ||||||
|  |  | ||||||
| # remember: BC$res contains the results | # remember: BC$res contains the results | ||||||
| head(BC$res) | head(BC$res) | ||||||
|  |  | ||||||
| BC$res[1]   # betweenness centrality of node 1 in the graph ... | BC$res[1]   # betweenness centrality of node 1 in the graph ... | ||||||
| # ... which one is node 1? | # ... which one is node 1? | ||||||
| igraph::V(gSTR)[1] | igraph::V(gSTR)[1] | ||||||
|  |  | ||||||
| # to get the ten-highest nodes, we simply label the elements of BC with their | # to get the ten-highest nodes, we simply label the elements of BC with their | ||||||
| # index ... | # index ... | ||||||
| names(BC$res) <- as.character(1:length(BC$res)) | names(BC$res) <- as.character(1:length(BC$res)) | ||||||
|  |  | ||||||
| # ... and then we sort: | # ... and then we sort: | ||||||
| sBC <- sort(BC$res, decreasing = TRUE) | sBC <- sort(BC$res, decreasing = TRUE) | ||||||
| head(sBC) | head(sBC) | ||||||
|  |  | ||||||
| # This ordered vector means: node 3 has the highest betweenness centrality, | # This ordered vector means: node 3 has the highest betweenness centrality, | ||||||
| # node 721 has the second highest, etc. | # node 721 has the second highest, etc. | ||||||
|  |  | ||||||
| (BCsel <- as.numeric(names(sBC)[1:10])) | (BCsel <- as.numeric(names(sBC)[1:10])) | ||||||
|  |  | ||||||
| # We can use the first ten labels to subset the nodes in gSTR and fetch the | # We can use the first ten labels to subset the nodes in gSTR and fetch the | ||||||
| # IDs... | # IDs... | ||||||
| (ENSPsel <- names(igraph::V(gSTR)[BCsel])) | (ENSPsel <- names(igraph::V(gSTR)[BCsel])) | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| # ===== | # ===== | ||||||
| # IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT | # IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT | ||||||
| # We are going to use these IDs to produce some output for a submitted task: | # We are going to use these IDs to produce some output for a submitted task: | ||||||
| # therefore I need you to execute the following line, note the "seal" that this | # therefore I need you to execute the following line, note the "seal" that this | ||||||
| # returns, and not change myENSPsel later: | # returns, and not change myENSPsel later: | ||||||
|  |  | ||||||
| myENSPsel <- selectENSP(ENSPsel) | myENSPsel <- selectENSP(ENSPsel) | ||||||
|  |  | ||||||
| #  Next, to find what these proteins are... | #  Next, to find what these proteins are... | ||||||
|  |  | ||||||
| # We could now Google for all of these IDs to learn more about them. But really, | # We could now Google for all of these IDs to learn more about them. But really, | ||||||
| # googling for IDs one after the other, that would be lame. Let's instead use | # googling for IDs one after the other, that would be lame. Let's instead use | ||||||
| # the very, very useful biomaRt package to translate these Ensembl IDs into | # the very, very useful biomaRt package to translate these Ensembl IDs into | ||||||
| # gene symbols. | # gene symbols. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  biomaRt  ============================================================= | # =    3  biomaRt  ============================================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # IDs are just labels, but for _bio_informatics we need to learn more about the | # IDs are just labels, but for _bio_informatics we need to learn more about the | ||||||
| # biological function of the genes or proteins that we retrieve via graph data | # biological function of the genes or proteins that we retrieve via graph data | ||||||
| # mining. biomaRt is the tool of choice. It's a package distributed by the | # mining. biomaRt is the tool of choice. It's a package distributed by the | ||||||
| # bioconductor project. This here is not a biomaRt tutorial (that's for another | # bioconductor project. This here is not a biomaRt tutorial (that's for another | ||||||
| # day), simply a few lines of sample code to get you started on the specific use | # day), simply a few lines of sample code to get you started on the specific use | ||||||
| # case of retrieving descriptions for ensembl protein IDs. | # case of retrieving descriptions for ensembl protein IDs. | ||||||
|  |  | ||||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
| if (! requireNamespace("biomaRt", quietly = TRUE)) { | if (! requireNamespace("biomaRt", quietly = TRUE)) { | ||||||
|   BiocManager::install("biomaRt") |   BiocManager::install("biomaRt") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = biomaRt)       # basic information | #  library(help = biomaRt)       # basic information | ||||||
| #  browseVignettes("biomaRt")    # available vignettes | #  browseVignettes("biomaRt")    # available vignettes | ||||||
| #  data(package = "biomaRt")     # available datasets | #  data(package = "biomaRt")     # available datasets | ||||||
|  |  | ||||||
| # define which dataset to use ... this takes a while for download | # define which dataset to use ... this takes a while for download | ||||||
| myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl") | myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl") | ||||||
|  |  | ||||||
| # what filters are defined? | # what filters are defined? | ||||||
| ( filters <- biomaRt::listFilters(myMart) ) | ( filters <- biomaRt::listFilters(myMart) ) | ||||||
|  |  | ||||||
|  |  | ||||||
| # and what attributes can we retrieve? | # and what attributes can we retrieve? | ||||||
| ( attributes <- biomaRt::listAttributes(myMart) ) | ( attributes <- biomaRt::listAttributes(myMart) ) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Soooo many options - let's look for the correct name of filters that are | # Soooo many options - let's look for the correct name of filters that are | ||||||
| # useful for ENSP IDs ... | # useful for ENSP IDs ... | ||||||
| filters[grep("ENSP", filters$description), ] | filters[grep("ENSP", filters$description), ] | ||||||
|  |  | ||||||
| # ... and the correct attribute names for gene symbols and descriptions ... | # ... and the correct attribute names for gene symbols and descriptions ... | ||||||
| attributes[grep("symbol", attributes$description, ignore.case = TRUE), ] | attributes[grep("symbol", attributes$description, ignore.case = TRUE), ] | ||||||
| attributes[grep("description", attributes$description, ignore.case = TRUE), ] | attributes[grep("description", attributes$description, ignore.case = TRUE), ] | ||||||
|  |  | ||||||
|  |  | ||||||
| # ... so we can put this together: here is a syntax example: | # ... so we can put this together: here is a syntax example: | ||||||
| biomaRt::getBM(filters = "ensembl_peptide_id", | biomaRt::getBM(filters = "ensembl_peptide_id", | ||||||
|                attributes = c("hgnc_symbol", |                attributes = c("hgnc_symbol", | ||||||
|                               "wikigene_description", |                               "wikigene_description", | ||||||
|                               "interpro_description", |                               "interpro_description", | ||||||
|                               "phenotype_description"), |                               "phenotype_description"), | ||||||
|                values = "ENSP00000000442", |                values = "ENSP00000000442", | ||||||
|                mart = myMart) |                mart = myMart) | ||||||
|  |  | ||||||
| # A simple loop will now get us the information for our 10 most central genes | # A simple loop will now get us the information for our 10 most central genes | ||||||
| # from the human subset of STRING. | # from the human subset of STRING. | ||||||
|  |  | ||||||
| CPdefs <- list()  # Since we don't know how many matches one of our queries | CPdefs <- list()  # Since we don't know how many matches one of our queries | ||||||
| # will return, we'll put the result dataframes into a list. | # will return, we'll put the result dataframes into a list. | ||||||
|  |  | ||||||
| for (ID in myENSPsel) { | for (ID in myENSPsel) { | ||||||
|   CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id", |   CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id", | ||||||
|                                  attributes = c("hgnc_symbol", |                                  attributes = c("hgnc_symbol", | ||||||
|                                                 "wikigene_description", |                                                 "wikigene_description", | ||||||
|                                                 "interpro_description", |                                                 "interpro_description", | ||||||
|                                                 "phenotype_description"), |                                                 "phenotype_description"), | ||||||
|                                  values = ID, |                                  values = ID, | ||||||
|                                  mart = myMart) |                                  mart = myMart) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # So what are the proteins with the ten highest betweenness centralities? | # So what are the proteins with the ten highest betweenness centralities? | ||||||
| #  ... are you surprised? (I am! Really.) | #  ... are you surprised? (I am! Really.) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Task for submission  ================================================= | # =    4  Task for submission  ================================================= | ||||||
|  |  | ||||||
| # Write a loop that will go through your personalized list of Ensembl IDs and | # Write a loop that will go through your personalized list of Ensembl IDs and | ||||||
| #    for each ID: | #    for each ID: | ||||||
| #    --  print the ID, | #    --  print the ID, | ||||||
| #    --  print the first row's HGNC symbol, | #    --  print the first row's HGNC symbol, | ||||||
| #    --  print the first row's wikigene description. | #    --  print the first row's wikigene description. | ||||||
| #    --  print the first row's phenotype. | #    --  print the first row's phenotype. | ||||||
| # | # | ||||||
| # Write your thoughts about this group of genes. | # Write your thoughts about this group of genes. | ||||||
| # | # | ||||||
| # (Hint, you can structure your loop in the same way as the loop that | # (Hint, you can structure your loop in the same way as the loop that | ||||||
| # created CPdefs. ) | # created CPdefs. ) | ||||||
|  |  | ||||||
| # Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code | # Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code | ||||||
| # for this loop and its output into your report if you are submitting | # for this loop and its output into your report if you are submitting | ||||||
| # anything for credit for this unit. Please read the requirements carefully. | # anything for credit for this unit. Please read the requirements carefully. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,252 +1,252 @@ | |||||||
| # tocID <- "BIN-SEQA-Composition.R" | # tocID <- "BIN-SEQA-Composition.R" | ||||||
| # | # | ||||||
| # Purpose: A Bioinformatics Course: | # Purpose: A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-SEQA-Composition unit | #              R code accompanying the BIN-SEQA-Composition unit | ||||||
| # | # | ||||||
| # Version: 1.2 | # Version: 1.2 | ||||||
| # | # | ||||||
| # Date:    2017-11  -  2020-09 | # Date:    2017-11  -  2020-09 | ||||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| #           1.2    2020 Maintenance | #           1.2    2020 Maintenance | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| # Versions: | # Versions: | ||||||
| #           1.0    First live version 2017 | #           1.0    First live version 2017 | ||||||
| #           0.1    First code copied from BCH441_A03_makeYFOlist.R | #           0.1    First code copied from BCH441_A03_makeYFOlist.R | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||||
| # | # | ||||||
| # DO NOT SIMPLY  source()  THESE FILES! | # DO NOT SIMPLY  source()  THESE FILES! | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| #  going on. That's not how it works ... | #  going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                      Line | #TOC>   Section  Title                                      Line | ||||||
| #TOC> ---------------------------------------------------------- | #TOC> ---------------------------------------------------------- | ||||||
| #TOC>   1        Preparation                                  48 | #TOC>   1        Preparation                                  48 | ||||||
| #TOC>   2        Aggregate properties                         69 | #TOC>   2        Aggregate properties                         69 | ||||||
| #TOC>   3        Sequence Composition Enrichment             113 | #TOC>   3        Sequence Composition Enrichment             113 | ||||||
| #TOC>   3.1        Barplot, and side-by-side barplot         136 | #TOC>   3.1        Barplot, and side-by-side barplot         136 | ||||||
| #TOC>   3.2        Plotting ratios                           171 | #TOC>   3.2        Plotting ratios                           171 | ||||||
| #TOC>   3.3        Plotting log ratios                       188 | #TOC>   3.3        Plotting log ratios                       188 | ||||||
| #TOC>   3.4        Sort by frequency                         204 | #TOC>   3.4        Sort by frequency                         204 | ||||||
| #TOC>   3.5        Color by amino acid type                  221 | #TOC>   3.5        Color by amino acid type                  221 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Preparation  ========================================================= | # =    1  Preparation  ========================================================= | ||||||
|  |  | ||||||
| if (! requireNamespace("seqinr", quietly = TRUE)) { | if (! requireNamespace("seqinr", quietly = TRUE)) { | ||||||
|   install.packages("seqinr") |   install.packages("seqinr") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = seqinr)       # basic information | #  library(help = seqinr)       # basic information | ||||||
| #  browseVignettes("seqinr")    # available vignettes | #  browseVignettes("seqinr")    # available vignettes | ||||||
| #  data(package = "seqinr")     # available datasets | #  data(package = "seqinr")     # available datasets | ||||||
|  |  | ||||||
| # Load a reference sequence to work with: | # Load a reference sequence to work with: | ||||||
|  |  | ||||||
| # If you have done the BIN-Storing_data unit: | # If you have done the BIN-Storing_data unit: | ||||||
|    source("makeProteinDB.R") |    source("makeProteinDB.R") | ||||||
|    sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE))) |    sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE))) | ||||||
|    mySeq <- myDB$protein$sequence[sel] |    mySeq <- myDB$protein$sequence[sel] | ||||||
|  |  | ||||||
| # If not, use the yeast Mbp1 sequence: | # If not, use the yeast Mbp1 sequence: | ||||||
|    mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence) |    mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Aggregate properties  ================================================ | # =    2  Aggregate properties  ================================================ | ||||||
|  |  | ||||||
|  |  | ||||||
| # Let's try a simple function from seqinr: computing the pI of the sequence | # Let's try a simple function from seqinr: computing the pI of the sequence | ||||||
| ?seqinr::computePI | ?seqinr::computePI | ||||||
|  |  | ||||||
| # This takes as input a vector of upper-case AA codes | # This takes as input a vector of upper-case AA codes | ||||||
|  |  | ||||||
| # We can use the function strsplit() to split the string | # We can use the function strsplit() to split the string | ||||||
| # into single characters | # into single characters | ||||||
|  |  | ||||||
| (s <- strsplit(mySeq, "")) # splitting on the empty string | (s <- strsplit(mySeq, "")) # splitting on the empty string | ||||||
|                            # splits into single characters |                            # splits into single characters | ||||||
| s <- unlist(s)             # strsplit() returns a list! Why? | s <- unlist(s)             # strsplit() returns a list! Why? | ||||||
|                            # (But we don't need a list now...) |                            # (But we don't need a list now...) | ||||||
|  |  | ||||||
| # Alternatively, seqinr provides | # Alternatively, seqinr provides | ||||||
| # the function s2c() to convert strings into | # the function s2c() to convert strings into | ||||||
| # character vectors (and c2s to convert them back). | # character vectors (and c2s to convert them back). | ||||||
|  |  | ||||||
| seqinr::s2c(mySeq) | seqinr::s2c(mySeq) | ||||||
|  |  | ||||||
|  |  | ||||||
| seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point | seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point | ||||||
| seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight | seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight | ||||||
| seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of | seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of | ||||||
|                                        # values along the sequence |                                        # values along the sequence | ||||||
|  |  | ||||||
| # A true Labor of Love has gone into the | # A true Labor of Love has gone into the | ||||||
| # compilation of the "aaindex" data: | # compilation of the "aaindex" data: | ||||||
|  |  | ||||||
| ?seqinr::aaindex | ?seqinr::aaindex | ||||||
| data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it | data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it | ||||||
|                                    # accessible as an R object |                                    # accessible as an R object | ||||||
|  |  | ||||||
| length(aaindex)  # no seqinr:: needed for the dataset since we just | length(aaindex)  # no seqinr:: needed for the dataset since we just | ||||||
|                  # "attached" it with data() |                  # "attached" it with data() | ||||||
|  |  | ||||||
| # Here are all the index descriptions | # Here are all the index descriptions | ||||||
| for (i in 1:length(aaindex)) { | for (i in 1:length(aaindex)) { | ||||||
|   cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep="")) |   cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep="")) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Sequence Composition Enrichment  ===================================== | # =    3  Sequence Composition Enrichment  ===================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Let's use one of the indices to calculate and plot amino-acid | # Let's use one of the indices to calculate and plot amino-acid | ||||||
| # composition enrichment: | # composition enrichment: | ||||||
| aaindex[[459]]$D | aaindex[[459]]$D | ||||||
|  |  | ||||||
| # | # | ||||||
| # Let's construct an enrichment plot to compare average frequencies | # Let's construct an enrichment plot to compare average frequencies | ||||||
| # with the amino acid counts in our sequence. | # with the amino acid counts in our sequence. | ||||||
|  |  | ||||||
| (refData <- aaindex[[459]]$I)                # reference frequencies in % | (refData <- aaindex[[459]]$I)                # reference frequencies in % | ||||||
| names(refData) <- seqinr::a(names(refData))  # change names to single-letter | names(refData) <- seqinr::a(names(refData))  # change names to single-letter | ||||||
|                                              # code using seqinr's "a()" function |                                              # code using seqinr's "a()" function | ||||||
| sum(refData) | sum(refData) | ||||||
| refData        # ... in % | refData        # ... in % | ||||||
|  |  | ||||||
|  |  | ||||||
| # tabulate the amino acid counts in mySeq | # tabulate the amino acid counts in mySeq | ||||||
| (obsData <- table(seqinr::s2c(mySeq)))        # counts | (obsData <- table(seqinr::s2c(mySeq)))        # counts | ||||||
| (obsData <- 100 * (obsData / sum(obsData)))   # frequencies | (obsData <- 100 * (obsData / sum(obsData)))   # frequencies | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.1  Barplot, and side-by-side barplot  ================================= | # ==   3.1  Barplot, and side-by-side barplot  ================================= | ||||||
|  |  | ||||||
| barplot(obsData, col = "#CCCCCC", cex.names = 0.7) | barplot(obsData, col = "#CCCCCC", cex.names = 0.7) | ||||||
| abline(h = 100/20, col="#BB0000") | abline(h = 100/20, col="#BB0000") | ||||||
|  |  | ||||||
| barplot(refData, col = "#BB0000", cex.names = 0.7) | barplot(refData, col = "#BB0000", cex.names = 0.7) | ||||||
| abline(h = 100/20, col="#555555") | abline(h = 100/20, col="#555555") | ||||||
|  |  | ||||||
| # Ok: first problem - obsData is in alphabetical order of one-letter code. But | # Ok: first problem - obsData is in alphabetical order of one-letter code. But | ||||||
| # the values in refData are in alphabetical order of amino acid name: alanine, | # the values in refData are in alphabetical order of amino acid name: alanine, | ||||||
| # arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this | # arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this | ||||||
| # order a lot - one of the old biochemistry tropes in the field. So we need to | # order a lot - one of the old biochemistry tropes in the field. So we need to | ||||||
| # re-order one of the vectors to match the other. That's easy though: | # re-order one of the vectors to match the other. That's easy though: | ||||||
| refData | refData | ||||||
| (refData <- refData[names(obsData)]) | (refData <- refData[names(obsData)]) | ||||||
|  |  | ||||||
| barplot(refData, col = "#BB0000", cex.names = 0.7) | barplot(refData, col = "#BB0000", cex.names = 0.7) | ||||||
| abline(h = 100/20, col="#555555") | abline(h = 100/20, col="#555555") | ||||||
|  |  | ||||||
| # To compare the values, we want to see them in a barplot, side-by-side ... | # To compare the values, we want to see them in a barplot, side-by-side ... | ||||||
| barplot(rbind(obsData, refData), | barplot(rbind(obsData, refData), | ||||||
|         ylim = c(0, 12), |         ylim = c(0, 12), | ||||||
|         beside = TRUE, |         beside = TRUE, | ||||||
|         col = c("#CCCCCC", "#BB0000"), |         col = c("#CCCCCC", "#BB0000"), | ||||||
|         cex.names = 0.7) |         cex.names = 0.7) | ||||||
| abline(h = 100/20, col="#00000044") | abline(h = 100/20, col="#00000044") | ||||||
|  |  | ||||||
| # ... and add a legend | # ... and add a legend | ||||||
| legend (x = 1, y = 12, | legend (x = 1, y = 12, | ||||||
|         legend = c("mySeq", "Average composition"), |         legend = c("mySeq", "Average composition"), | ||||||
|         fill = c("#CCCCCC", "#BB0000"), |         fill = c("#CCCCCC", "#BB0000"), | ||||||
|         cex = 0.7, |         cex = 0.7, | ||||||
|         bty = "n") |         bty = "n") | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.2  Plotting ratios  =================================================== | # ==   3.2  Plotting ratios  =================================================== | ||||||
|  |  | ||||||
| # To better compare the values, we'll calculate ratios between | # To better compare the values, we'll calculate ratios between | ||||||
| # obsData and refData | # obsData and refData | ||||||
|  |  | ||||||
| barplot(obsData / refData, | barplot(obsData / refData, | ||||||
|         col = "#CCCCCC", |         col = "#CCCCCC", | ||||||
|         ylab = "Sequence / Average", |         ylab = "Sequence / Average", | ||||||
|         ylim = c(0, 2.5), |         ylim = c(0, 2.5), | ||||||
|         cex.names = 0.7) |         cex.names = 0.7) | ||||||
| abline(h = 1, col="#BB0000") | abline(h = 1, col="#BB0000") | ||||||
| abline(h = c(1/2, 2), lty = 2, col="#BB000055") | abline(h = c(1/2, 2), lty = 2, col="#BB000055") | ||||||
|  |  | ||||||
| # ... but ratios are not very good here, since the difference in height on the | # ... but ratios are not very good here, since the difference in height on the | ||||||
| # plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted | # plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted | ||||||
| # lines) are exactly the same fold-difference ! | # lines) are exactly the same fold-difference ! | ||||||
|  |  | ||||||
| # ==   3.3  Plotting log ratios  =============================================== | # ==   3.3  Plotting log ratios  =============================================== | ||||||
|  |  | ||||||
| # A better way to display this | # A better way to display this | ||||||
| # is to plot log(ratios). | # is to plot log(ratios). | ||||||
|  |  | ||||||
| barplot(log(obsData / refData), | barplot(log(obsData / refData), | ||||||
|         col = "#CCCCCC", |         col = "#CCCCCC", | ||||||
|         ylab = "log(Sequence / Average)", |         ylab = "log(Sequence / Average)", | ||||||
|         ylim = log(c(1/3, 3)), |         ylim = log(c(1/3, 3)), | ||||||
|         cex.names = 0.7) |         cex.names = 0.7) | ||||||
| abline(h = log(1), col="#BB0000") | abline(h = log(1), col="#BB0000") | ||||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") | abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") | ||||||
|  |  | ||||||
| # Note how the two-fold difference lines are now the same distance from the | # Note how the two-fold difference lines are now the same distance from the | ||||||
| # line of equal ratio. | # line of equal ratio. | ||||||
|  |  | ||||||
| # ==   3.4  Sort by frequency  ================================================= | # ==   3.4  Sort by frequency  ================================================= | ||||||
|  |  | ||||||
| barplot(sort(log(obsData / refData), decreasing = TRUE), | barplot(sort(log(obsData / refData), decreasing = TRUE), | ||||||
|         ylim = log(c(1/3, 3)), |         ylim = log(c(1/3, 3)), | ||||||
|         col = "#CCCCCC", |         col = "#CCCCCC", | ||||||
|         ylab = "log(Sequence / Average)", |         ylab = "log(Sequence / Average)", | ||||||
|         cex.names = 0.7) |         cex.names = 0.7) | ||||||
| abline(h = log(1), col="#BB0000") | abline(h = log(1), col="#BB0000") | ||||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") | abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055") | ||||||
|  |  | ||||||
| yTxt <- log(0.9) | yTxt <- log(0.9) | ||||||
| arrows(4, yTxt, 0, yTxt, length = 0.07) | arrows(4, yTxt, 0, yTxt, length = 0.07) | ||||||
| text(5.5, yTxt, "Enriched", cex = 0.7) | text(5.5, yTxt, "Enriched", cex = 0.7) | ||||||
| yTxt <- log(1.1) | yTxt <- log(1.1) | ||||||
| arrows(20, yTxt, 24, yTxt, length = 0.07) | arrows(20, yTxt, 24, yTxt, length = 0.07) | ||||||
| text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) | text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) | ||||||
|  |  | ||||||
| # ==   3.5  Color by amino acid type  ========================================== | # ==   3.5  Color by amino acid type  ========================================== | ||||||
|  |  | ||||||
| # Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R | # Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R | ||||||
| # script, or define your own. | # script, or define your own. | ||||||
|  |  | ||||||
| barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5) | barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5) | ||||||
|  |  | ||||||
| lR <- sort(log(obsData / refData), decreasing = TRUE) | lR <- sort(log(obsData / refData), decreasing = TRUE) | ||||||
| barplot(lR, | barplot(lR, | ||||||
|         ylim = log(c(1/3, 3)), |         ylim = log(c(1/3, 3)), | ||||||
|         col = AACOLS[names(lR)], |         col = AACOLS[names(lR)], | ||||||
|         ylab = "log(Sequence / Average)", |         ylab = "log(Sequence / Average)", | ||||||
|         cex.names = 0.7) |         cex.names = 0.7) | ||||||
| abline(h = log(1), col="#00000055") | abline(h = log(1), col="#00000055") | ||||||
| abline(h = log(c(1/2, 2)), lty = 2, col="#00000033") | abline(h = log(c(1/2, 2)), lty = 2, col="#00000033") | ||||||
|  |  | ||||||
| yTxt <- log(0.9) | yTxt <- log(0.9) | ||||||
| arrows(4, yTxt, 0, yTxt, length = 0.07) | arrows(4, yTxt, 0, yTxt, length = 0.07) | ||||||
| text(5.5, yTxt, "Enriched", cex = 0.7) | text(5.5, yTxt, "Enriched", cex = 0.7) | ||||||
| yTxt <- log(1.1) | yTxt <- log(1.1) | ||||||
| arrows(20, yTxt, 24, yTxt, length = 0.07) | arrows(20, yTxt, 24, yTxt, length = 0.07) | ||||||
| text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) | text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| #   Interpret this plot. (Can you?) Which types of amino acids are enriched? | #   Interpret this plot. (Can you?) Which types of amino acids are enriched? | ||||||
| #   Depleted? | #   Depleted? | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										788
									
								
								BIN-Sequence.R
									
									
									
									
									
								
							
							
						
						
									
										788
									
								
								BIN-Sequence.R
									
									
									
									
									
								
							| @@ -1,394 +1,394 @@ | |||||||
| # tocID <- "BIN-Sequence.R" | # tocID <- "BIN-Sequence.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-Sequence unit. | #              R code accompanying the BIN-Sequence unit. | ||||||
| # | # | ||||||
| # Version:  1.5 | # Version:  1.5 | ||||||
| # | # | ||||||
| # Date:     2017-09  - 2020-09 | # Date:     2017-09  - 2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.5    2020 Updates | #           1.5    2020 Updates | ||||||
| #           1.4    Change from require() to requireNamespace(), | #           1.4    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| #           1.3    Update set.seed() usage | #           1.3    Update set.seed() usage | ||||||
| #           1.2    Removed irrelevant task. How did that even get in there? smh | #           1.2    Removed irrelevant task. How did that even get in there? smh | ||||||
| #           1.1    Add chartr() | #           1.1    Add chartr() | ||||||
| #           1.0    First live version 2017. | #           1.0    First live version 2017. | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                                Line | #TOC>   Section  Title                                Line | ||||||
| #TOC> ---------------------------------------------------- | #TOC> ---------------------------------------------------- | ||||||
| #TOC>   1        Prepare                                63 | #TOC>   1        Prepare                                63 | ||||||
| #TOC>   2        Storing Sequence                       80 | #TOC>   2        Storing Sequence                       80 | ||||||
| #TOC>   3        String properties                     109 | #TOC>   3        String properties                     109 | ||||||
| #TOC>   4        Substrings                            116 | #TOC>   4        Substrings                            116 | ||||||
| #TOC>   5        Creating strings: sprintf()           137 | #TOC>   5        Creating strings: sprintf()           137 | ||||||
| #TOC>   6        Changing strings                      172 | #TOC>   6        Changing strings                      172 | ||||||
| #TOC>   6.1.1          Changing case                   174 | #TOC>   6.1.1          Changing case                   174 | ||||||
| #TOC>   6.1.2          Reverse                         179 | #TOC>   6.1.2          Reverse                         179 | ||||||
| #TOC>   6.1.3          Change characters               183 | #TOC>   6.1.3          Change characters               183 | ||||||
| #TOC>   6.1.4          Substitute characters           211 | #TOC>   6.1.4          Substitute characters           211 | ||||||
| #TOC>   6.2        stringi and stringr                 231 | #TOC>   6.2        stringi and stringr                 231 | ||||||
| #TOC>   6.3        dbSanitizeSequence()                241 | #TOC>   6.3        dbSanitizeSequence()                241 | ||||||
| #TOC>   7        Permuting and sampling                253 | #TOC>   7        Permuting and sampling                253 | ||||||
| #TOC>   7.1        Permutations                        260 | #TOC>   7.1        Permutations                        260 | ||||||
| #TOC>   7.2        Sampling                            306 | #TOC>   7.2        Sampling                            306 | ||||||
| #TOC>   7.2.1          Equiprobable characters         308 | #TOC>   7.2.1          Equiprobable characters         308 | ||||||
| #TOC>   7.2.2          Defined probability vector      350 | #TOC>   7.2.2          Defined probability vector      350 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Prepare  ============================================================= | # =    1  Prepare  ============================================================= | ||||||
|  |  | ||||||
| # Much basic sequence handling is supported by the Bioconductor package | # Much basic sequence handling is supported by the Bioconductor package | ||||||
| # Biostrings. | # Biostrings. | ||||||
|  |  | ||||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   BiocManager::install("Biostrings") |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Storing Sequence  ==================================================== | # =    2  Storing Sequence  ==================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Sequences can be represented and stored as vectors of single characters ... | # Sequences can be represented and stored as vectors of single characters ... | ||||||
| (v <- c("D", "I", "V", "M", "T", "Q")) | (v <- c("D", "I", "V", "M", "T", "Q")) | ||||||
|  |  | ||||||
| # ... as strings ... | # ... as strings ... | ||||||
| (s <- "DIVMTQ") | (s <- "DIVMTQ") | ||||||
|  |  | ||||||
| # ... or as more complex objects with rich metadata e.g. as a Biostrings | # ... or as more complex objects with rich metadata e.g. as a Biostrings | ||||||
| # DNAString, RNAString, AAString, etc. | # DNAString, RNAString, AAString, etc. | ||||||
| (a <- Biostrings::AAString("DIVMTQ")) | (a <- Biostrings::AAString("DIVMTQ")) | ||||||
|  |  | ||||||
| # ... and all of these representations can be interconverted: | # ... and all of these representations can be interconverted: | ||||||
|  |  | ||||||
| # string to vector ... | # string to vector ... | ||||||
| unlist(strsplit(s, "")) | unlist(strsplit(s, "")) | ||||||
|  |  | ||||||
| # vector to string ... | # vector to string ... | ||||||
| paste(v, sep = "", collapse = "") | paste(v, sep = "", collapse = "") | ||||||
|  |  | ||||||
| # ... and AAString to plain string. | # ... and AAString to plain string. | ||||||
| as.character(a) | as.character(a) | ||||||
|  |  | ||||||
| # Since operations with character vectors trivially follow all other vector | # Since operations with character vectors trivially follow all other vector | ||||||
| # conventions and syntax, and we will look at Biostrings methods in more | # conventions and syntax, and we will look at Biostrings methods in more | ||||||
| # detail in a later unit, we will focus on basic strings in the following. | # detail in a later unit, we will focus on basic strings in the following. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  String properties  =================================================== | # =    3  String properties  =================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| length(s) # why ??? | length(s) # why ??? | ||||||
| nchar(s)  # Aha! | nchar(s)  # Aha! | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Substrings  ========================================================== | # =    4  Substrings  ========================================================== | ||||||
|  |  | ||||||
| # Use the substr() function | # Use the substr() function | ||||||
| substr(s, 2, 4) | substr(s, 2, 4) | ||||||
|  |  | ||||||
| # or the similar substring() | # or the similar substring() | ||||||
| substring(s, 2, 4) | substring(s, 2, 4) | ||||||
|  |  | ||||||
| # Note: both functions are vectorized (i.e. they operate on vectors | # Note: both functions are vectorized (i.e. they operate on vectors | ||||||
| # of arguments, you don't need to loop over input)... | # of arguments, you don't need to loop over input)... | ||||||
| myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA") | myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA") | ||||||
| substr(   myBiCodes, 1, 3) | substr(   myBiCodes, 1, 3) | ||||||
| substring(myBiCodes, 1, 3) | substring(myBiCodes, 1, 3) | ||||||
|  |  | ||||||
| # ... however only substring() will also use vectors for start and stop | # ... however only substring() will also use vectors for start and stop | ||||||
| s <- "gatattgtgatgacccagtaa"       # a DNA sequence | s <- "gatattgtgatgacccagtaa"       # a DNA sequence | ||||||
| (vI <- seq(1, nchar(s), by = 3))   # an index vector | (vI <- seq(1, nchar(s), by = 3))   # an index vector | ||||||
| substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet | substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet | ||||||
| substring(s, vI, vI+2)             # ... returns all triplets | substring(s, vI, vI+2)             # ... returns all triplets | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    5  Creating strings: sprintf()  ========================================= | # =    5  Creating strings: sprintf()  ========================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # Sprintf is a very smart, very powerful function and has cognates in all | # Sprintf is a very smart, very powerful function and has cognates in all | ||||||
| # other programming languages. It has a bit of a  learning curve, but this is | # other programming languages. It has a bit of a  learning curve, but this is | ||||||
| # totally worth it: | # totally worth it: | ||||||
| # the function takes a format string, and a list of other arguments. It returns | # the function takes a format string, and a list of other arguments. It returns | ||||||
| # a formatted string. Here are some examples - watch carefully for sprintf() | # a formatted string. Here are some examples - watch carefully for sprintf() | ||||||
| # calls elsewhere in the code. | # calls elsewhere in the code. | ||||||
|  |  | ||||||
| sprintf("Just a string.") | sprintf("Just a string.") | ||||||
| sprintf("A string and the number %d.", 5) | sprintf("A string and the number %d.", 5) | ||||||
| sprintf("More numbers: %d ate %d.", 7, 9) # Sorry | sprintf("More numbers: %d ate %d.", 7, 9) # Sorry | ||||||
| sprintf("Pi is ~ %1.2f ...", pi) | sprintf("Pi is ~ %1.2f ...", pi) | ||||||
| sprintf("or more accurately ~ %1.11f.", pi) | sprintf("or more accurately ~ %1.11f.", pi) | ||||||
| x <- "bottles of beer" | x <- "bottles of beer" | ||||||
| N <- 99 | N <- 99 | ||||||
| sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.", | sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.", | ||||||
|         N, x, N, x, "one down, and pass it around", N - 1, x) |         N, x, N, x, "one down, and pass it around", N - 1, x) | ||||||
|  |  | ||||||
| # Note that in the last example, the value of the string was displayed with | # Note that in the last example, the value of the string was displayed with | ||||||
| # R's usual print-formatting function and therefore the line-break "\n" did | # R's usual print-formatting function and therefore the line-break "\n" did | ||||||
| # not actually break the line. To have line breaks, tabs etc, you need to use | # not actually break the line. To have line breaks, tabs etc, you need to use | ||||||
| # cat() to display the string: | # cat() to display the string: | ||||||
|  |  | ||||||
| for (i in N:(N-4)) { | for (i in N:(N-4)) { | ||||||
|   cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n", |   cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n", | ||||||
|               i, x, i, x, "one down, and pass it around", i - 1, x)) |               i, x, i, x, "one down, and pass it around", i - 1, x)) | ||||||
| } | } | ||||||
|  |  | ||||||
| # sprintf() is vectorized: if one of its parameters is a vector, it | # sprintf() is vectorized: if one of its parameters is a vector, it | ||||||
| # will generate one output string for each of the vector's elements: | # will generate one output string for each of the vector's elements: | ||||||
| cat(sprintf("\n%s fish", c("one", "two", "red", "blue"))) | cat(sprintf("\n%s fish", c("one", "two", "red", "blue"))) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    6  Changing strings  ==================================================== | # =    6  Changing strings  ==================================================== | ||||||
|  |  | ||||||
| # ===   6.1.1  Changing case | # ===   6.1.1  Changing case | ||||||
| tolower(s) | tolower(s) | ||||||
| toupper(tolower(s)) | toupper(tolower(s)) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ===   6.1.2  Reverse | # ===   6.1.2  Reverse | ||||||
| # (This used to work in Biostrings, apparently it doesn't work anymore. Why?) | # (This used to work in Biostrings, apparently it doesn't work anymore. Why?) | ||||||
| # Biostrings::str_rev(s) | # Biostrings::str_rev(s) | ||||||
| # The following works, of course, but awkward: | # The following works, of course, but awkward: | ||||||
| s | s | ||||||
| paste0(rev(unlist(strsplit(s, ""))), collapse = "") | paste0(rev(unlist(strsplit(s, ""))), collapse = "") | ||||||
|  |  | ||||||
| # reverse complement | # reverse complement | ||||||
| COMP <- c("t", "g", "c", "a") | COMP <- c("t", "g", "c", "a") | ||||||
| names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names | names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names | ||||||
| s | s | ||||||
| paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "") | paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "") | ||||||
|  |  | ||||||
|  |  | ||||||
| # ===   6.1.3  Change characters | # ===   6.1.3  Change characters | ||||||
| # chartr(old, new, x) maps all characters in x that appear in "old" to the | # chartr(old, new, x) maps all characters in x that appear in "old" to the | ||||||
| # corresponding character in "new." Kind of like the COMP vector above ... | # corresponding character in "new." Kind of like the COMP vector above ... | ||||||
|  |  | ||||||
| chartr("aeio", "uuuu", "We hold these truths to be self-evident ...") | chartr("aeio", "uuuu", "We hold these truths to be self-evident ...") | ||||||
|  |  | ||||||
| # One could implement toupper() and tolower() with this - remember that R has | # One could implement toupper() and tolower() with this - remember that R has | ||||||
| # character vectors of uppercase and lowercase letters as language constants. | # character vectors of uppercase and lowercase letters as language constants. | ||||||
| chartr(paste0(letters, collapse = ""), | chartr(paste0(letters, collapse = ""), | ||||||
|        paste0(LETTERS, collapse = ""), |        paste0(LETTERS, collapse = ""), | ||||||
|        "Twinkle, twinkle little star, how I wonder what you are.") |        "Twinkle, twinkle little star, how I wonder what you are.") | ||||||
|  |  | ||||||
| # One amusing way to use the function  is for a reversible substitution | # One amusing way to use the function  is for a reversible substitution | ||||||
| # cypher. | # cypher. | ||||||
| alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789" | alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789" | ||||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | set.seed(112358)                       # set RNG seed for repeatable randomness | ||||||
| ( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") ) | ( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") ) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| # encode ... | # encode ... | ||||||
| (x <- chartr(alBet, myCypher, "... seven for a secret, never to be told.")) | (x <- chartr(alBet, myCypher, "... seven for a secret, never to be told.")) | ||||||
|  |  | ||||||
| # decode ... | # decode ... | ||||||
| chartr(myCypher, alBet, x) | chartr(myCypher, alBet, x) | ||||||
| # (Nb. substitution cyphers are easy to crack!) | # (Nb. substitution cyphers are easy to crack!) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ===   6.1.4  Substitute characters | # ===   6.1.4  Substitute characters | ||||||
| # gsub can change lengths. | # gsub can change lengths. | ||||||
| #   Example: implementing the binary Fibonacci sequence: | #   Example: implementing the binary Fibonacci sequence: | ||||||
| #   0 -> 1; 1 -> 10 , in three nested gsub() statements | #   0 -> 1; 1 -> 10 , in three nested gsub() statements | ||||||
| ( s <- 1 ) | ( s <- 1 ) | ||||||
| ( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) ) | ( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) ) | ||||||
|  |  | ||||||
| # Iterate this line a few times ... | # Iterate this line a few times ... | ||||||
| # | # | ||||||
| # cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html | # cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html | ||||||
| # for the features of the sequence. | # for the features of the sequence. | ||||||
|  |  | ||||||
| # I use gsub() often to delete unwanted characters ... | # I use gsub() often to delete unwanted characters ... | ||||||
| # ... select something, and substitute the empty string for it. | # ... select something, and substitute the empty string for it. | ||||||
| (s <- gsub("-", "", s)) | (s <- gsub("-", "", s)) | ||||||
|  |  | ||||||
| # For example: clean up a sequence | # For example: clean up a sequence | ||||||
| # copy/paste from UniProt | # copy/paste from UniProt | ||||||
| (s <- "        10         20         30         40         50 | (s <- "        10         20         30         40         50 | ||||||
| MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ") | MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ") | ||||||
|  |  | ||||||
|  |  | ||||||
| # remove numbers | # remove numbers | ||||||
| (s <- gsub("[0-9]", "", s)) | (s <- gsub("[0-9]", "", s)) | ||||||
|  |  | ||||||
| # remove "whitespace" (spaces, tabs, line breaks)... | # remove "whitespace" (spaces, tabs, line breaks)... | ||||||
| (s <- gsub("\\s", "", s)) | (s <- gsub("\\s", "", s)) | ||||||
|  |  | ||||||
| # ==   6.2  stringi and stringr  =============================================== | # ==   6.2  stringi and stringr  =============================================== | ||||||
|  |  | ||||||
| # But there are also specialized functions e.g. to remove leading/trailing | # But there are also specialized functions e.g. to remove leading/trailing | ||||||
| # whitespace which may be important to sanitize user input etc. Have a look at | # whitespace which may be important to sanitize user input etc. Have a look at | ||||||
| # the function descriptions for the stringr and the stringi package. stringr is | # the function descriptions for the stringr and the stringi package. stringr is | ||||||
| # part of the tidyverse, and for the most part a wrapper for stringi functions. | # part of the tidyverse, and for the most part a wrapper for stringi functions. | ||||||
| # https://github.com/tidyverse/stringr | # https://github.com/tidyverse/stringr | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   6.3  dbSanitizeSequence()  ============================================== | # ==   6.3  dbSanitizeSequence()  ============================================== | ||||||
|  |  | ||||||
| # In our learning units, we use a function dbSanitizeSequence() to clean up | # In our learning units, we use a function dbSanitizeSequence() to clean up | ||||||
| # sequences that may be copy/pasted from Web-sources | # sequences that may be copy/pasted from Web-sources | ||||||
|  |  | ||||||
| cat( s <- ">FASTA header will be removed | cat( s <- ">FASTA header will be removed | ||||||
| 10         20         30         40         50 | 10         20         30         40         50 | ||||||
| MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " ) | MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " ) | ||||||
|  |  | ||||||
| dbSanitizeSequence(s) | dbSanitizeSequence(s) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    7  Permuting and sampling  ============================================== | # =    7  Permuting and sampling  ============================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # An important aspect of working with strings is generating random strings | # An important aspect of working with strings is generating random strings | ||||||
| # with given statistical properties: reference items to evaluate significance. | # with given statistical properties: reference items to evaluate significance. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   7.1  Permutations  ====================================================== | # ==   7.1  Permutations  ====================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # One way to produce such reference items is to permute a string. A permuted | # One way to produce such reference items is to permute a string. A permuted | ||||||
| # string has the same composition as the original, but all positional | # string has the same composition as the original, but all positional | ||||||
| # information is lost. The sample() function can be used to permute: | # information is lost. The sample() function can be used to permute: | ||||||
|  |  | ||||||
| # This is the sequence of the ompA secretion signal | # This is the sequence of the ompA secretion signal | ||||||
| (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) | (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) | ||||||
|  |  | ||||||
| (x <- sample(s, length(s)))  # permuted | (x <- sample(s, length(s)))  # permuted | ||||||
|  |  | ||||||
| # Here's a small example of how such permuted strings may be useful. As you look | # Here's a small example of how such permuted strings may be useful. As you look | ||||||
| # at the ompA sequence, you suspect that the two lysines near the +-charged | # at the ompA sequence, you suspect that the two lysines near the +-charged | ||||||
| # N-terminus may not be accidental, but selected for a positively charged | # N-terminus may not be accidental, but selected for a positively charged | ||||||
| # N-terminus. What is the chance that such a sequence has two lysines close to | # N-terminus. What is the chance that such a sequence has two lysines close to | ||||||
| # the N-terminus simply by chance? Or put differently: what is the average | # the N-terminus simply by chance? Or put differently: what is the average | ||||||
| # distance of two lysines in such a sequence to the N-terminus. First, we | # distance of two lysines in such a sequence to the N-terminus. First, we | ||||||
| # need an expression that measures the distance. A simple use of the which() | # need an expression that measures the distance. A simple use of the which() | ||||||
| # function will do just fine. | # function will do just fine. | ||||||
|  |  | ||||||
| which(s == "K")        # shows they are in position 2 and 3, so ... | which(s == "K")        # shows they are in position 2 and 3, so ... | ||||||
| mean(which(s == "K"))  # ... gives us the average, and ... | mean(which(s == "K"))  # ... gives us the average, and ... | ||||||
| mean(which(x == "K"))  # ... gives us the average of the permuted sequence. | mean(which(x == "K"))  # ... gives us the average of the permuted sequence. | ||||||
|  |  | ||||||
| # So what does the distribution look like? Let's do 10,000 trials. | # So what does the distribution look like? Let's do 10,000 trials. | ||||||
|  |  | ||||||
| (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) | (s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", ""))) | ||||||
| N <- 10000 | N <- 10000 | ||||||
| d <- numeric(N) | d <- numeric(N) | ||||||
|  |  | ||||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | set.seed(112358)                       # set RNG seed for repeatable randomness | ||||||
| for (i in 1:N) { | for (i in 1:N) { | ||||||
|   d[i] <- mean(which(sample(s, length(s)) == "K")) |   d[i] <- mean(which(sample(s, length(s)) == "K")) | ||||||
| } | } | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| hist(d, breaks = 20) | hist(d, breaks = 20) | ||||||
| abline(v = 2.5, lwd = 2, col = "firebrick") | abline(v = 2.5, lwd = 2, col = "firebrick") | ||||||
| sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the | sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the | ||||||
|               # N-terminus or more. That's just below the significance |               # N-terminus or more. That's just below the significance | ||||||
|               # threshold of 5 %. It's a trend, but to be sure we are looking |               # threshold of 5 %. It's a trend, but to be sure we are looking | ||||||
|               # at a biological effect we would need to see more |               # at a biological effect we would need to see more | ||||||
|               # sequences. |               # sequences. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   7.2  Sampling  ========================================================== | # ==   7.2  Sampling  ========================================================== | ||||||
|  |  | ||||||
| # ===   7.2.1  Equiprobable characters | # ===   7.2.1  Equiprobable characters | ||||||
|  |  | ||||||
| # Assume you need a large random-nucleotide string for some statistical model. | # Assume you need a large random-nucleotide string for some statistical model. | ||||||
| # How to create such a string? sample() can easily create it: | # How to create such a string? sample() can easily create it: | ||||||
|  |  | ||||||
| nuc <- c("A", "C", "G", "T") | nuc <- c("A", "C", "G", "T") | ||||||
| N <- 100 | N <- 100 | ||||||
|  |  | ||||||
| set.seed(16818)                        # set RNG seed for repeatable randomness | set.seed(16818)                        # set RNG seed for repeatable randomness | ||||||
| v <- sample(nuc, N, replace = TRUE) | v <- sample(nuc, N, replace = TRUE) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| (mySeq <- paste(v, collapse = "")) | (mySeq <- paste(v, collapse = "")) | ||||||
|  |  | ||||||
| # What's the GC content? | # What's the GC content? | ||||||
| table(v) | table(v) | ||||||
| sum(table(v)[c("G", "C")]) # 51 is close to expected | sum(table(v)[c("G", "C")]) # 51 is close to expected | ||||||
|  |  | ||||||
| # What's the number of CpG motifs? Easy to check with the stringi | # What's the number of CpG motifs? Easy to check with the stringi | ||||||
| # stri_match_all() function | # stri_match_all() function | ||||||
|  |  | ||||||
| if (! requireNamespace("stringi", quietly = TRUE)) { | if (! requireNamespace("stringi", quietly = TRUE)) { | ||||||
|   install.packages("stringi") |   install.packages("stringi") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = stringi)       # basic information | #  library(help = stringi)       # basic information | ||||||
| #  browseVignettes("stringi")    # available vignettes | #  browseVignettes("stringi")    # available vignettes | ||||||
| #  data(package = "stringi")     # available datasets | #  data(package = "stringi")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| (x <- stringi::stri_match_all(mySeq, regex = "CG")) | (x <- stringi::stri_match_all(mySeq, regex = "CG")) | ||||||
| length(unlist(x)) | length(unlist(x)) | ||||||
|  |  | ||||||
| # Now you could compare that number with yeast DNA sequences, and determine | # Now you could compare that number with yeast DNA sequences, and determine | ||||||
| # whether there are more or less CpG motifs than expected by chance. | # whether there are more or less CpG motifs than expected by chance. | ||||||
| # (cf. https://en.wikipedia.org/wiki/CpG_site) | # (cf. https://en.wikipedia.org/wiki/CpG_site) | ||||||
| # But hold on: is that a fair comparison? sample() gives us all four nucleotides | # But hold on: is that a fair comparison? sample() gives us all four nucleotides | ||||||
| # with the same probability. But the yeast genomic DNA GC content is only | # with the same probability. But the yeast genomic DNA GC content is only | ||||||
| # 38%. So you would expect fewer CpG motifs based on the statistical properties | # 38%. So you would expect fewer CpG motifs based on the statistical properties | ||||||
| # of the smaller number of Cs and Gs - before biology even comes into play. How | # of the smaller number of Cs and Gs - before biology even comes into play. How | ||||||
| # do we account for that? | # do we account for that? | ||||||
|  |  | ||||||
| # ===   7.2.2  Defined probability vector | # ===   7.2.2  Defined probability vector | ||||||
|  |  | ||||||
| # This is where we need to know how to create samples with specific probability | # This is where we need to know how to create samples with specific probability | ||||||
| # distributions. A crude hack would be to create a sampling source vector with | # distributions. A crude hack would be to create a sampling source vector with | ||||||
| # 19 C, 19 G, 31 A and 31 T | # 19 C, 19 G, 31 A and 31 T | ||||||
| c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31)) | c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31)) | ||||||
| # ... but that doesn't scale if the numeric accuracy needs to be higher. | # ... but that doesn't scale if the numeric accuracy needs to be higher. | ||||||
| # | # | ||||||
| # However sample() has an argument that takes care of that: you can explicitly | # However sample() has an argument that takes care of that: you can explicitly | ||||||
| # specify the probabilities with which each element of the sampling vector | # specify the probabilities with which each element of the sampling vector | ||||||
| # should be chosen: | # should be chosen: | ||||||
|  |  | ||||||
| nuc <- c("A", "C", "G", "T") | nuc <- c("A", "C", "G", "T") | ||||||
| N <- 100 | N <- 100 | ||||||
| myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities | myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities | ||||||
|  |  | ||||||
| set.seed(16818)                       # set RNG seed for repeatable randomness | set.seed(16818)                       # set RNG seed for repeatable randomness | ||||||
| v <- sample(nuc, N, prob = myProb, replace = TRUE) | v <- sample(nuc, N, prob = myProb, replace = TRUE) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| (mySeq <- paste(v, collapse = "")) | (mySeq <- paste(v, collapse = "")) | ||||||
|  |  | ||||||
| # What's the GC content? | # What's the GC content? | ||||||
| table(v) | table(v) | ||||||
| sum(table(v)[c("G", "C")]) # Close to expected | sum(table(v)[c("G", "C")]) # Close to expected | ||||||
|  |  | ||||||
| # What's the number of CpG motifs? | # What's the number of CpG motifs? | ||||||
| (x <- stringi::stri_match_all(mySeq, regex = "CG")) | (x <- stringi::stri_match_all(mySeq, regex = "CG")) | ||||||
| # ... not a single one in this case. | # ... not a single one in this case. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										1368
									
								
								BIN-Storing_data.R
									
									
									
									
									
								
							
							
						
						
									
										1368
									
								
								BIN-Storing_data.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,349 +1,349 @@ | |||||||
| # tocID <- "FND-Genetic_code.R" | # tocID <- "FND-Genetic_code.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the FND-Genetic_code unit. | #              R code accompanying the FND-Genetic_code unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017  10  -  2019  01 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Maintenance | #           1.2    2020 Maintenance | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0.1  Comment on "incomplete final line" warning in FASTA | #           1.0.1  Comment on "incomplete final line" warning in FASTA | ||||||
| #           1.0    First live version | #           1.0    First live version | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                                            Line | #TOC>   Section  Title                                            Line | ||||||
| #TOC> ---------------------------------------------------------------- | #TOC> ---------------------------------------------------------------- | ||||||
| #TOC>   1        Storing the genetic code                           45 | #TOC>   1        Storing the genetic code                           45 | ||||||
| #TOC>   1.1        Genetic code in Biostrings                       63 | #TOC>   1.1        Genetic code in Biostrings                       63 | ||||||
| #TOC>   2        Working with the genetic code                      94 | #TOC>   2        Working with the genetic code                      94 | ||||||
| #TOC>   2.1        Translate a sequence.                           129 | #TOC>   2.1        Translate a sequence.                           129 | ||||||
| #TOC>   3        An alternative representation: 3D array           212 | #TOC>   3        An alternative representation: 3D array           212 | ||||||
| #TOC>   3.1        Print a Genetic code table                      246 | #TOC>   3.1        Print a Genetic code table                      246 | ||||||
| #TOC>   4        Tasks                                             272 | #TOC>   4        Tasks                                             272 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Storing the genetic code  ============================================ | # =    1  Storing the genetic code  ============================================ | ||||||
|  |  | ||||||
| # The genetic code maps trinucleotide codons to amino acids. To store it, we | # The genetic code maps trinucleotide codons to amino acids. To store it, we | ||||||
| # need some mechanism to associate the two representations. The most | # need some mechanism to associate the two representations. The most | ||||||
| # convenient way to do that is a "named vector" which holds the amino acid | # convenient way to do that is a "named vector" which holds the amino acid | ||||||
| # code and assigns the codons as names to its elements. | # code and assigns the codons as names to its elements. | ||||||
|  |  | ||||||
| x <- c("M", "H", "H", "*", "*", "*") | x <- c("M", "H", "H", "*", "*", "*") | ||||||
| names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA") | names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA") | ||||||
| x | x | ||||||
|  |  | ||||||
| # Then we can access the vector by the codon as name, and retrieve the | # Then we can access the vector by the codon as name, and retrieve the | ||||||
| # amino acid ... | # amino acid ... | ||||||
|  |  | ||||||
| x["ATG"] | x["ATG"] | ||||||
| x["CAC"] | x["CAC"] | ||||||
| x["TAA"] | x["TAA"] | ||||||
|  |  | ||||||
| # ... or the names of elements, to retrieve the codon(s) | # ... or the names of elements, to retrieve the codon(s) | ||||||
| names(x)[x == "M"] | names(x)[x == "M"] | ||||||
| names(x)[x == "H"] | names(x)[x == "H"] | ||||||
| names(x)[x == "*"] | names(x)[x == "*"] | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.1  Genetic code in Biostrings  ======================================== | # ==   1.1  Genetic code in Biostrings  ======================================== | ||||||
|  |  | ||||||
| # Conveniently, the standard genetic code as well as its alternatives are | # Conveniently, the standard genetic code as well as its alternatives are | ||||||
| # available in the Bioconductor "Biostrings" package: | # available in the Bioconductor "Biostrings" package: | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   BiocManager::install("Biostrings") |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # The standard genetic code vector | # The standard genetic code vector | ||||||
| Biostrings::GENETIC_CODE | Biostrings::GENETIC_CODE | ||||||
|  |  | ||||||
| # The table of genetic codes. This information corresponds to this page | # The table of genetic codes. This information corresponds to this page | ||||||
| # at the NCBI: | # at the NCBI: | ||||||
| # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes | # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes | ||||||
| Biostrings::GENETIC_CODE_TABLE | Biostrings::GENETIC_CODE_TABLE | ||||||
|  |  | ||||||
| # Most of the alternative codes are mitochondrial codes. The id of the | # Most of the alternative codes are mitochondrial codes. The id of the | ||||||
| # Alternative Yeast Nuclear code is "12" | # Alternative Yeast Nuclear code is "12" | ||||||
| Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear | Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Working with the genetic code  ======================================= | # =    2  Working with the genetic code  ======================================= | ||||||
|  |  | ||||||
| # We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it | # We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it | ||||||
| # to a "local" variable, rather than retrieving it from the package all the | # to a "local" variable, rather than retrieving it from the package all the | ||||||
| # time. | # time. | ||||||
|  |  | ||||||
| GC <- Biostrings::GENETIC_CODE | GC <- Biostrings::GENETIC_CODE | ||||||
|  |  | ||||||
| # This is a named vector of characters ... | # This is a named vector of characters ... | ||||||
|  |  | ||||||
| str(GC) | str(GC) | ||||||
|  |  | ||||||
| # ... which also stores the alternative initiation codons TTG and CTG in | # ... which also stores the alternative initiation codons TTG and CTG in | ||||||
| # an attribute of the vector. (Alternative initiation codons sometimes are | # an attribute of the vector. (Alternative initiation codons sometimes are | ||||||
| # used instead of ATG to initiate translation; if translation is not initiated | # used instead of ATG to initiate translation; if translation is not initiated | ||||||
| # at ATG, these are still translated with fMet.) | # at ATG, these are still translated with fMet.) | ||||||
|  |  | ||||||
| attr(GC, "alt_init_codons") | attr(GC, "alt_init_codons") | ||||||
|  |  | ||||||
| # But the key to use this vector is in the "names" which we use for subsetting | # But the key to use this vector is in the "names" which we use for subsetting | ||||||
| # the list of amino acids in whatever way we need. | # the list of amino acids in whatever way we need. | ||||||
| names(GC) | names(GC) | ||||||
|  |  | ||||||
| # The translation of "TGG" ... | # The translation of "TGG" ... | ||||||
| GC["TGG"] | GC["TGG"] | ||||||
|  |  | ||||||
| # All stop codons | # All stop codons | ||||||
| names(GC)[GC == "*"] | names(GC)[GC == "*"] | ||||||
|  |  | ||||||
| # All start codons | # All start codons | ||||||
| names(GC)[GC == "M"] # ... or | names(GC)[GC == "M"] # ... or | ||||||
| c(names(GC)[GC == "M"], | c(names(GC)[GC == "M"], | ||||||
|   attr(GC, "alt_init_codons")) |   attr(GC, "alt_init_codons")) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   2.1  Translate a sequence.  ============================================= | # ==   2.1  Translate a sequence.  ============================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # I have provided a gene sequence in the data directory: | # I have provided a gene sequence in the data directory: | ||||||
| # S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence. | # S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence. | ||||||
|  |  | ||||||
| # read it | # read it | ||||||
| mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||||
|  |  | ||||||
| # You will notice that this generates a Warning message: | # You will notice that this generates a Warning message: | ||||||
| #      Warning message: | #      Warning message: | ||||||
| #        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") : | #        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") : | ||||||
| #        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa' | #        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa' | ||||||
|  |  | ||||||
| # The reason for this is that the last character of the file is the letter "A" | # The reason for this is that the last character of the file is the letter "A" | ||||||
| # and not a "\n" line break. This file is exactly how it was sent from the | # and not a "\n" line break. This file is exactly how it was sent from the | ||||||
| # NCBI server; I think good, defensive programming practice would have been to | # NCBI server; I think good, defensive programming practice would have been to | ||||||
| # include some kind of an end-marker in the file, like a final "\n". This helps | # include some kind of an end-marker in the file, like a final "\n". This helps | ||||||
| # us recognize an incomplete transmission. Let's parse the actual sequence from | # us recognize an incomplete transmission. Let's parse the actual sequence from | ||||||
| # the file, and then check for completeness. | # the file, and then check for completeness. | ||||||
|  |  | ||||||
|  |  | ||||||
| head(mbp1) | head(mbp1) | ||||||
|  |  | ||||||
| # drop the first line (header) | # drop the first line (header) | ||||||
| mbp1 <- mbp1[-1] | mbp1 <- mbp1[-1] | ||||||
| head(mbp1) | head(mbp1) | ||||||
|  |  | ||||||
| # concatenate it all to a single string | # concatenate it all to a single string | ||||||
| mbp1 <- paste(mbp1, sep = "", collapse = "") | mbp1 <- paste(mbp1, sep = "", collapse = "") | ||||||
|  |  | ||||||
| # how long is it? | # how long is it? | ||||||
| nchar(mbp1) | nchar(mbp1) | ||||||
|  |  | ||||||
| # how many codons? | # how many codons? | ||||||
| nchar(mbp1)/3 | nchar(mbp1)/3 | ||||||
|  |  | ||||||
| # That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a | # That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a | ||||||
| # first verification that the file we read is complete: the number of | # first verification that the file we read is complete: the number of | ||||||
| # nucleotides in a complete ORF should be divisible by 3. | # nucleotides in a complete ORF should be divisible by 3. | ||||||
|  |  | ||||||
| # Extract the codons. There are many ways to split a long string into chunks | # Extract the codons. There are many ways to split a long string into chunks | ||||||
| # of three characters. Here we use the Biostrings  codons()  function. codons() | # of three characters. Here we use the Biostrings  codons()  function. codons() | ||||||
| # requires an object of type DNAString - a special kind of string with | # requires an object of type DNAString - a special kind of string with | ||||||
| # attributes that are useful for Biostrings. Thus we convert the sequence first | # attributes that are useful for Biostrings. Thus we convert the sequence first | ||||||
| # with DNAString(), then split it up, then convert it into a plain | # with DNAString(), then split it up, then convert it into a plain | ||||||
| # character vector. | # character vector. | ||||||
| mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1))) | mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1))) | ||||||
|  |  | ||||||
| head(mbp1Codons) | head(mbp1Codons) | ||||||
|  |  | ||||||
| # now translate each codon | # now translate each codon | ||||||
|  |  | ||||||
| mbp1AA <- character(834) | mbp1AA <- character(834) | ||||||
| for (i in seq_along(mbp1Codons)) { | for (i in seq_along(mbp1Codons)) { | ||||||
|   mbp1AA[i] <- GC[mbp1Codons[i]] |   mbp1AA[i] <- GC[mbp1Codons[i]] | ||||||
| } | } | ||||||
|  |  | ||||||
| head(mbp1Codons) | head(mbp1Codons) | ||||||
| head(mbp1AA) | head(mbp1AA) | ||||||
|  |  | ||||||
| tail(mbp1Codons) | tail(mbp1Codons) | ||||||
| tail(mbp1AA) # Note the stop! | tail(mbp1AA) # Note the stop! | ||||||
|  |  | ||||||
| # The TAA "ochre" stop codon is our second verification that the nucleotide | # The TAA "ochre" stop codon is our second verification that the nucleotide | ||||||
| # sequence is complete: a stop codon can't appear internally in an ORF. | # sequence is complete: a stop codon can't appear internally in an ORF. | ||||||
|  |  | ||||||
| # We can work with the mbp1AA vector, for example to tabulate the | # We can work with the mbp1AA vector, for example to tabulate the | ||||||
| # amino acid frequencies: | # amino acid frequencies: | ||||||
| table(mbp1AA) | table(mbp1AA) | ||||||
| sort(table(mbp1AA), decreasing = TRUE) | sort(table(mbp1AA), decreasing = TRUE) | ||||||
|  |  | ||||||
| # Or we can paste all elements together into a single string. But let's remove | # Or we can paste all elements together into a single string. But let's remove | ||||||
| # the stop, it's not actually a part of the sequence. To remove the last element | # the stop, it's not actually a part of the sequence. To remove the last element | ||||||
| # of a vector, re-assign it with a vector minus the index of the last element: | # of a vector, re-assign it with a vector minus the index of the last element: | ||||||
| mbp1AA <- mbp1AA[-(length(mbp1AA))] | mbp1AA <- mbp1AA[-(length(mbp1AA))] | ||||||
| tail(mbp1AA) # Note the stop is gone! | tail(mbp1AA) # Note the stop is gone! | ||||||
|  |  | ||||||
| # paste it together, collapsing the elements using an empty string as the | # paste it together, collapsing the elements using an empty string as the | ||||||
| # separation-character (i.e.: nothing) | # separation-character (i.e.: nothing) | ||||||
| (Mbp1 <- paste(mbp1AA, sep = "", collapse = "")) | (Mbp1 <- paste(mbp1AA, sep = "", collapse = "")) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  An alternative representation: 3D array  ============================= | # =    3  An alternative representation: 3D array  ============================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # We don't use 3D arrays often - usually just 2D tables and data frames, so | # We don't use 3D arrays often - usually just 2D tables and data frames, so | ||||||
| # here is a good opportunity to review the syntax of 3D arrays with a | # here is a good opportunity to review the syntax of 3D arrays with a | ||||||
| # genetic code cube: | # genetic code cube: | ||||||
|  |  | ||||||
| # Initialize, using A G C T as the names of the elements in each dimension | # Initialize, using A G C T as the names of the elements in each dimension | ||||||
| cCube <- array(data     = character(64), | cCube <- array(data     = character(64), | ||||||
|                dim      = c(4, 4, 4), |                dim      = c(4, 4, 4), | ||||||
|                dimnames = list(c("A", "G", "C", "T"), |                dimnames = list(c("A", "G", "C", "T"), | ||||||
|                                c("A", "G", "C", "T"), |                                c("A", "G", "C", "T"), | ||||||
|                                c("A", "G", "C", "T"))) |                                c("A", "G", "C", "T"))) | ||||||
|  |  | ||||||
| # fill it with amino acid codes using three nested loops | # fill it with amino acid codes using three nested loops | ||||||
| for (i in 1:4) { | for (i in 1:4) { | ||||||
|   for (j in 1:4) { |   for (j in 1:4) { | ||||||
|     for (k in 1:4) { |     for (k in 1:4) { | ||||||
|       myCodon <- paste(dimnames(cCube)[[1]][i], |       myCodon <- paste(dimnames(cCube)[[1]][i], | ||||||
|                        dimnames(cCube)[[2]][j], |                        dimnames(cCube)[[2]][j], | ||||||
|                        dimnames(cCube)[[3]][k], |                        dimnames(cCube)[[3]][k], | ||||||
|                        sep = "", |                        sep = "", | ||||||
|                        collapse = "") |                        collapse = "") | ||||||
|       cCube[i, j, k] <- GC[myCodon] |       cCube[i, j, k] <- GC[myCodon] | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| # confirm | # confirm | ||||||
| cCube["A", "T", "G"] # methionine | cCube["A", "T", "G"] # methionine | ||||||
| cCube["T", "T", "T"] # phenylalanine | cCube["T", "T", "T"] # phenylalanine | ||||||
| cCube["T", "A", "G"] # stop (amber) | cCube["T", "A", "G"] # stop (amber) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.1  Print a Genetic code table  ======================================== | # ==   3.1  Print a Genetic code table  ======================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # The data structure of our cCube is well suited to print a table. In the | # The data structure of our cCube is well suited to print a table. In the | ||||||
| # "standard" way to print the genetic code, we write codons with the same | # "standard" way to print the genetic code, we write codons with the same | ||||||
| # second nucleotide in columns, and arrange rows in blocks of same | # second nucleotide in columns, and arrange rows in blocks of same | ||||||
| # first nucleotide, varying the third nucleotide fastest. This maximizes the | # first nucleotide, varying the third nucleotide fastest. This maximizes the | ||||||
| # similarity of adjacent amino acids in the table if we print the | # similarity of adjacent amino acids in the table if we print the | ||||||
| # nucleotides in the order T C A G. It's immediately obvious that the code | # nucleotides in the order T C A G. It's immediately obvious that the code | ||||||
| # is not random: the universal genetic code is exceptionally error tolerant in | # is not random: the universal genetic code is exceptionally error tolerant in | ||||||
| # the sense that mutations (or single-nucleotide translation errors) are likely | # the sense that mutations (or single-nucleotide translation errors) are likely | ||||||
| # to result in an amino acid with similar biophysical properties as the | # to result in an amino acid with similar biophysical properties as the | ||||||
| # original. | # original. | ||||||
|  |  | ||||||
| nuc <- c("T", "C", "A", "G") | nuc <- c("T", "C", "A", "G") | ||||||
|  |  | ||||||
| # (calling variables f, s, t to indicate first, second, and third position ...) | # (calling variables f, s, t to indicate first, second, and third position ...) | ||||||
| for (f in nuc) {      # first varies in blocks | for (f in nuc) {      # first varies in blocks | ||||||
|   for (t in nuc) {    # third varies in columns |   for (t in nuc) {    # third varies in columns | ||||||
|     for (s in nuc) {  # second varies in rows |     for (s in nuc) {  # second varies in rows | ||||||
|       cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t])) |       cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t])) | ||||||
|     } |     } | ||||||
|     cat("\n") |     cat("\n") | ||||||
|   } |   } | ||||||
|   cat("\n") |   cat("\n") | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Tasks  =============================================================== | # =    4  Tasks  =============================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: What do you need to change to print the table with U instead | # Task: What do you need to change to print the table with U instead | ||||||
| #         of T? Try it. | #         of T? Try it. | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: Point mutations are more often transitions (purine -> purine; | # Task: Point mutations are more often transitions (purine -> purine; | ||||||
| #         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine; | #         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine; | ||||||
| #         pyrimidine -> purine), even though twice as many transversions | #         pyrimidine -> purine), even though twice as many transversions | ||||||
| #         are possible in the code. This is most likely due to a deamination / | #         are possible in the code. This is most likely due to a deamination / | ||||||
| #         tautomerization process that favours C -> T changes. If the code | #         tautomerization process that favours C -> T changes. If the code | ||||||
| #         indeed minimizes the effect of mutations, you would expect that | #         indeed minimizes the effect of mutations, you would expect that | ||||||
| #         codons that differ by a transition code for more similar amino acids | #         codons that differ by a transition code for more similar amino acids | ||||||
| #         than codons that differ by a transversion. Is that true? List the set | #         than codons that differ by a transversion. Is that true? List the set | ||||||
| #         of all amino acid pairs that are encoded by codons with a C -> T | #         of all amino acid pairs that are encoded by codons with a C -> T | ||||||
| #         transition. Then list the set of amino acid pairs with a C -> A | #         transition. Then list the set of amino acid pairs with a C -> A | ||||||
| #         transversion. Which set of pairs is more similar? | #         transversion. Which set of pairs is more similar? | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: How many stop codons do the two mbp1-gene derived amino acid sequences | # Task: How many stop codons do the two mbp1-gene derived amino acid sequences | ||||||
| #         have if you translate them in the 2. or the 3. frame? | #         have if you translate them in the 2. or the 3. frame? | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: How does the amino acid composition change if you translate the mbp1 | # Task: How does the amino acid composition change if you translate the mbp1 | ||||||
| #         gene with the Alternative Yeast Nuclear code that is used by the | #         gene with the Alternative Yeast Nuclear code that is used by the | ||||||
| #         "GTC clade" of fungi? | #         "GTC clade" of fungi? | ||||||
| #         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code ) | #         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code ) | ||||||
|  |  | ||||||
| # Solution: | # Solution: | ||||||
|  |  | ||||||
|     # Fetch the code |     # Fetch the code | ||||||
|     Biostrings::GENETIC_CODE_TABLE |     Biostrings::GENETIC_CODE_TABLE | ||||||
|     Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"] |     Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"] | ||||||
|     altYcode <- Biostrings::getGeneticCode("12") |     altYcode <- Biostrings::getGeneticCode("12") | ||||||
|  |  | ||||||
|     # what's the difference? |     # what's the difference? | ||||||
|     (delta <- which(Biostrings::GENETIC_CODE != altYcode)) |     (delta <- which(Biostrings::GENETIC_CODE != altYcode)) | ||||||
|  |  | ||||||
|     Biostrings::GENETIC_CODE[delta] |     Biostrings::GENETIC_CODE[delta] | ||||||
|     altYcode[delta] |     altYcode[delta] | ||||||
|  |  | ||||||
|     # translate |     # translate | ||||||
|     altYAA <- character(834) |     altYAA <- character(834) | ||||||
|     for (i in seq_along(mbp1Codons)) { |     for (i in seq_along(mbp1Codons)) { | ||||||
|       altYAA[i] <- altYcode[mbp1Codons[i]] |       altYAA[i] <- altYcode[mbp1Codons[i]] | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     table(mbp1AA) |     table(mbp1AA) | ||||||
|     table(altYAA) |     table(altYAA) | ||||||
|  |  | ||||||
| # Task: The genetic code has significant redundancy, i.e. there are up to six | # Task: The genetic code has significant redundancy, i.e. there are up to six | ||||||
| #         codons that code for the same amino acid. Write code that lists how | #         codons that code for the same amino acid. Write code that lists how | ||||||
| #         many amino acids are encoded how often, i.e. it should tell you that | #         many amino acids are encoded how often, i.e. it should tell you that | ||||||
| #         two amino acids are encoded only with a single codon, three amino | #         two amino acids are encoded only with a single codon, three amino | ||||||
| #         acids have six codons, etc. Solution below, but don't peek. There | #         acids have six codons, etc. Solution below, but don't peek. There | ||||||
| #         are many possible ways to do this. | #         are many possible ways to do this. | ||||||
| # | # | ||||||
| # | # | ||||||
| # Solution: | # Solution: | ||||||
| ( x <- table(table(Biostrings::GENETIC_CODE)) ) | ( x <- table(table(Biostrings::GENETIC_CODE)) ) | ||||||
|  |  | ||||||
| # confirm | # confirm | ||||||
| sum(x * as.numeric(names(x))) | sum(x * as.numeric(names(x))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,224 +1,224 @@ | |||||||
| # tocID <- "FND-STA-Information_theory.R" | # tocID <- "FND-STA-Information_theory.R" | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the FND-STA-Information_theory unit. | #              R code accompanying the FND-STA-Information_theory unit. | ||||||
| # | # | ||||||
| # Version:  0.2.1 | # Version:  0.2.1 | ||||||
| # | # | ||||||
| # Date:     2017 - 2021 | # Date:     2017 - 2021 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           0.2.1  Maintenance | #           0.2.1  Maintenance | ||||||
| #           0.2    Under development | #           0.2    Under development | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                  Line | #TOC>   Section  Title                  Line | ||||||
| #TOC> -------------------------------------- | #TOC> -------------------------------------- | ||||||
| #TOC>   1        ___Section___            39 | #TOC>   1        ___Section___            39 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  ___Section___  ======================================================= | # =    1  ___Section___  ======================================================= | ||||||
|  |  | ||||||
| # What level of information is "significant"? | # What level of information is "significant"? | ||||||
|  |  | ||||||
| # Assume the background distribution is the database frequencies of | # Assume the background distribution is the database frequencies of | ||||||
| # amino acids: | # amino acids: | ||||||
|  |  | ||||||
| AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to | AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to | ||||||
| # sum to 1.0 | # sum to 1.0 | ||||||
| AAref["A"] <- 0.0904 | AAref["A"] <- 0.0904 | ||||||
| AAref["C"] <- 0.0123 | AAref["C"] <- 0.0123 | ||||||
| AAref["D"] <- 0.0545 | AAref["D"] <- 0.0545 | ||||||
| AAref["E"] <- 0.0617 | AAref["E"] <- 0.0617 | ||||||
| AAref["F"] <- 0.0394 | AAref["F"] <- 0.0394 | ||||||
| AAref["G"] <- 0.0724 | AAref["G"] <- 0.0724 | ||||||
| AAref["H"] <- 0.0221 | AAref["H"] <- 0.0221 | ||||||
| AAref["I"] <- 0.0573 | AAref["I"] <- 0.0573 | ||||||
| AAref["K"] <- 0.0504 | AAref["K"] <- 0.0504 | ||||||
| AAref["L"] <- 0.0986 | AAref["L"] <- 0.0986 | ||||||
| AAref["M"] <- 0.0240 | AAref["M"] <- 0.0240 | ||||||
| AAref["N"] <- 0.0392 | AAref["N"] <- 0.0392 | ||||||
| AAref["P"] <- 0.0486 | AAref["P"] <- 0.0486 | ||||||
| AAref["Q"] <- 0.0381 | AAref["Q"] <- 0.0381 | ||||||
| AAref["R"] <- 0.0570 | AAref["R"] <- 0.0570 | ||||||
| AAref["S"] <- 0.0673 | AAref["S"] <- 0.0673 | ||||||
| AAref["T"] <- 0.0558 | AAref["T"] <- 0.0558 | ||||||
| AAref["V"] <- 0.0686 | AAref["V"] <- 0.0686 | ||||||
| AAref["W"] <- 0.0129 | AAref["W"] <- 0.0129 | ||||||
| AAref["Y"] <- 0.0294 | AAref["Y"] <- 0.0294 | ||||||
| sum(AAref) | sum(AAref) | ||||||
|  |  | ||||||
| # Function to calculate Shannon entropy | # Function to calculate Shannon entropy | ||||||
| H <- function(pmf) { | H <- function(pmf) { | ||||||
|   # Calculate Shannon entropy |   # Calculate Shannon entropy | ||||||
|   # Parameters: |   # Parameters: | ||||||
|   #   pmf (numeric) probability mass function: a vector of states and |   #   pmf (numeric) probability mass function: a vector of states and | ||||||
|   #                 associated probabilities. Each element of |   #                 associated probabilities. Each element of | ||||||
|   #                 pmf must be in (0, 1] and sum(pmf) must be 1. |   #                 pmf must be in (0, 1] and sum(pmf) must be 1. | ||||||
|   # Value: |   # Value: | ||||||
|   #   Shannon entropy in bits. |   #   Shannon entropy in bits. | ||||||
|   # Examples: |   # Examples: | ||||||
|   #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random |   #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random | ||||||
|   #                                         # nucleotide sequence |   #                                         # nucleotide sequence | ||||||
|   #   H(1)     # If all elements are the same, entropy is zero |   #   H(1)     # If all elements are the same, entropy is zero | ||||||
|   # |   # | ||||||
|   if (any(pmf <= 0 | pmf > 1) || isFALSE(all.equal(1.0, sum(pmf)))) { |   if (any(pmf <= 0 | pmf > 1) || isFALSE(all.equal(1.0, sum(pmf)))) { | ||||||
|     stop("Input is not a discrete probability distribution.") |     stop("Input is not a discrete probability distribution.") | ||||||
|   } |   } | ||||||
|   H <- -sum(pmf * (log(pmf) / log(2))) |   H <- -sum(pmf * (log(pmf) / log(2))) | ||||||
|   return(H) |   return(H) | ||||||
| } | } | ||||||
|  |  | ||||||
| # Why use all.equal()? Exact comparisons with floating point numbers are | # Why use all.equal()? Exact comparisons with floating point numbers are | ||||||
| # brittle. Consider for example: | # brittle. Consider for example: | ||||||
| 1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1 | 1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1 | ||||||
| print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777 | print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777 | ||||||
| # all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8 | # all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8 | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Entropy of the database frequencies (in bits): | # Entropy of the database frequencies (in bits): | ||||||
| (Href <- H(AAref)) | (Href <- H(AAref)) | ||||||
|  |  | ||||||
| # for comparison: entropy if all amino acids are equiprobable | # for comparison: entropy if all amino acids are equiprobable | ||||||
| H(rep(0.05, 20)) | H(rep(0.05, 20)) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Set up a simulation to estimate the distribution of Information values | # Set up a simulation to estimate the distribution of Information values | ||||||
| # from random sequences drawn from AAref. This is the distribution for the | # from random sequences drawn from AAref. This is the distribution for the | ||||||
| # statistical null hypothesis: | # statistical null hypothesis: | ||||||
| nObs <- 15                      # number of observations (e.g aligned sequences) | nObs <- 15                      # number of observations (e.g aligned sequences) | ||||||
| # nObs <- 80 | # nObs <- 80 | ||||||
| nTrials <- 10000                # number of trials | nTrials <- 10000                # number of trials | ||||||
| IObs <- numeric(nTrials)        # vector to store Information in each trial | IObs <- numeric(nTrials)        # vector to store Information in each trial | ||||||
| simCounts <- numeric(20)        # vector to tabulate our information ... | simCounts <- numeric(20)        # vector to tabulate our information ... | ||||||
| names(simCounts) <- names(AAref)# ... with the names of AAref | names(simCounts) <- names(AAref)# ... with the names of AAref | ||||||
|  |  | ||||||
|  |  | ||||||
| for (i in 1:nTrials) {  # simulate ... | for (i in 1:nTrials) {  # simulate ... | ||||||
|  |  | ||||||
|   # sample AAref letters, nObs times, with the probabilities of AAref: |   # sample AAref letters, nObs times, with the probabilities of AAref: | ||||||
|   AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE) |   AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE) | ||||||
|  |  | ||||||
|   x <- table(AAobs)                            # table simulated observations |   x <- table(AAobs)                            # table simulated observations | ||||||
|   simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0 |   simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0 | ||||||
|   simCounts[names(x)] <- x                     # overwrite with observed counts |   simCounts[names(x)] <- x                     # overwrite with observed counts | ||||||
|   simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts |   simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts | ||||||
|   Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H |   Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H | ||||||
|   IObs[i] <- Href - Hobs                       # store information |   IObs[i] <- Href - Hobs                       # store information | ||||||
| } | } | ||||||
|  |  | ||||||
| # evaluate | # evaluate | ||||||
| hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25) | hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25) | ||||||
| abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC") | abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC") | ||||||
|  |  | ||||||
| # The purple lines are drawn at the 5% and 95% quantiles of the IObs | # The purple lines are drawn at the 5% and 95% quantiles of the IObs | ||||||
| # distribution - i.e. an actual observation that lies outside the purple | # distribution - i.e. an actual observation that lies outside the purple | ||||||
| # lines is deemed "significant"(1)(2). Of course, this is only true to the | # lines is deemed "significant"(1)(2). Of course, this is only true to the | ||||||
| # degree that the database frequencies are a valid model for the | # degree that the database frequencies are a valid model for the | ||||||
| # null-hypothesis on the sequence position we are considering here. | # null-hypothesis on the sequence position we are considering here. | ||||||
|  |  | ||||||
| #  (1) If we use 5% quantiles, this means a value is significantly larger | #  (1) If we use 5% quantiles, this means a value is significantly larger | ||||||
| #      than expected, and we ignore cases when the value is < 0; if we | #      than expected, and we ignore cases when the value is < 0; if we | ||||||
| #      consider both smaller and larger values, we need to use 2.5% quantiles, | #      consider both smaller and larger values, we need to use 2.5% quantiles, | ||||||
| #      since 5% of all observations lie outside the 0.025 and 0.975 | #      since 5% of all observations lie outside the 0.025 and 0.975 | ||||||
| #      quantiles. | #      quantiles. | ||||||
| # | # | ||||||
| #  (2) For an actual observation of counts, we calculate its observed | #  (2) For an actual observation of counts, we calculate its observed | ||||||
| #      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1). | #      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1). | ||||||
|  |  | ||||||
|  |  | ||||||
| # You can probably now appreciate that information is a bit of a shortcut for | # You can probably now appreciate that information is a bit of a shortcut for | ||||||
| # biological sequences, and does not really take the different inherent | # biological sequences, and does not really take the different inherent | ||||||
| # frequencies based on the character of the amino acids into account. For | # frequencies based on the character of the amino acids into account. For | ||||||
| # example, L is the most frequent and C is the least frequent, but if we have an | # example, L is the most frequent and C is the least frequent, but if we have an | ||||||
| # alignment of 1000 sequences and we see that the frequencies for L and C are | # alignment of 1000 sequences and we see that the frequencies for L and C are | ||||||
| # swapped, that would be _very_ surprising - nevertheless, the information would | # swapped, that would be _very_ surprising - nevertheless, the information would | ||||||
| # be 0. In order to take that into account, we should actually compute | # be 0. In order to take that into account, we should actually compute | ||||||
| # Kullback-Leibler divergences. | # Kullback-Leibler divergences. | ||||||
|  |  | ||||||
|  |  | ||||||
| # Swap C and L frequencies | # Swap C and L frequencies | ||||||
| p <- AAref | p <- AAref | ||||||
| q <- AAref | q <- AAref | ||||||
| q["L"] <- AAref["C"] | q["L"] <- AAref["C"] | ||||||
| q["C"] <- AAref["L"] | q["C"] <- AAref["L"] | ||||||
| H(p) | H(p) | ||||||
| H(q) | H(q) | ||||||
|  |  | ||||||
| KLdiv <- function(p, q) { | KLdiv <- function(p, q) { | ||||||
|   # p and q are two pmfs of discrete probability distributions |   # p and q are two pmfs of discrete probability distributions | ||||||
|   # with the same outcomes, which are nowhere 0. |   # with the same outcomes, which are nowhere 0. | ||||||
|   # Value:  Kullback-Leibler divergence  sum(p * log(p / q)), in nats. |   # Value:  Kullback-Leibler divergence  sum(p * log(p / q)), in nats. | ||||||
|  |  | ||||||
|   if (length(p) != length(q)) { |   if (length(p) != length(q)) { | ||||||
|     stop("PANIC: input vector lengths differ!") |     stop("PANIC: input vector lengths differ!") | ||||||
|   } |   } | ||||||
|   if (any(c((p == 0), (q == 0)))) { |   if (any(c((p == 0), (q == 0)))) { | ||||||
|     stop("PANIC: 0's found in input vectors!") |     stop("PANIC: 0's found in input vectors!") | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   return(sum(p * log( p / q ))) |   return(sum(p * log( p / q ))) | ||||||
| } | } | ||||||
|  |  | ||||||
| KLdiv(p, p) | KLdiv(p, p) | ||||||
| KLdiv(p, q) | KLdiv(p, q) | ||||||
|  |  | ||||||
|  |  | ||||||
| nObs <- 15                      # number of observations (e.g aligned sequences) | nObs <- 15                      # number of observations (e.g aligned sequences) | ||||||
| # nObs <- 80 | # nObs <- 80 | ||||||
| nTrials <- 10000                # number of trials | nTrials <- 10000                # number of trials | ||||||
| KLdivObs <- numeric(nTrials)    # vector to store KL divergence in each trial | KLdivObs <- numeric(nTrials)    # vector to store KL divergence in each trial | ||||||
| simCounts <- numeric(20)        # vector to tabulate our information ... | simCounts <- numeric(20)        # vector to tabulate our information ... | ||||||
| names(simCounts) <- names(AAref)# ... with the names of AAref | names(simCounts) <- names(AAref)# ... with the names of AAref | ||||||
|  |  | ||||||
|  |  | ||||||
| for (i in 1:nTrials) {  # simulate ... | for (i in 1:nTrials) {  # simulate ... | ||||||
|  |  | ||||||
|   # sample AAref letters, nObs times, with the probabilities of AAref: |   # sample AAref letters, nObs times, with the probabilities of AAref: | ||||||
|   AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE) |   AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE) | ||||||
|  |  | ||||||
|   x <- table(AAobs)                            # table simulated observations |   x <- table(AAobs)                            # table simulated observations | ||||||
|   simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0 |   simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0 | ||||||
|   simCounts[names(x)] <- x                     # overwrite with observed counts |   simCounts[names(x)] <- x                     # overwrite with observed counts | ||||||
|   simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts |   simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts | ||||||
|   simCounts <- simCounts/sum(simCounts)        # counts to frequency |   simCounts <- simCounts/sum(simCounts)        # counts to frequency | ||||||
|   KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv |   KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv | ||||||
| } | } | ||||||
|  |  | ||||||
| # evaluate | # evaluate | ||||||
| hist(KLdivObs, col = "#C9F4E3", breaks = 25) | hist(KLdivObs, col = "#C9F4E3", breaks = 25) | ||||||
| abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC") | abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC") | ||||||
| quantile(KLdivObs, 0.992) | quantile(KLdivObs, 0.992) | ||||||
|  |  | ||||||
| # Running the simulation with KL does not give a fundamentally | # Running the simulation with KL does not give a fundamentally | ||||||
| # different behaviour - since we are just randomly sampling. But KL would be | # different behaviour - since we are just randomly sampling. But KL would be | ||||||
| # more sensitive in case there is biological selection, where the sampling is no | # more sensitive in case there is biological selection, where the sampling is no | ||||||
| # longer random. If I run the same simulation, with nObs <- 80 but calculating | # longer random. If I run the same simulation, with nObs <- 80 but calculating | ||||||
| # KLdiv instead of information, I get a 95% quantile at 0.15 - but the C/L | # KLdiv instead of information, I get a 95% quantile at 0.15 - but the C/L | ||||||
| # frequency swap gives me a KL divergence of 0.18 - this is significant at p = | # frequency swap gives me a KL divergence of 0.18 - this is significant at p = | ||||||
| # 0.008 - (remember, Information is 0 in this case). So that's actually quite a | # 0.008 - (remember, Information is 0 in this case). So that's actually quite a | ||||||
| # nice addition to the toolbox. | # nice addition to the toolbox. | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,351 +1,351 @@ | |||||||
| # tocID <- "FND-STA-Significance.R" | # tocID <- "FND-STA-Significance.R" | ||||||
| # | # | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the FND-STA-Significance unit. | #              R code accompanying the FND-STA-Significance unit. | ||||||
| # | # | ||||||
| # Version:  1.3 | # Version:  1.3 | ||||||
| # | # | ||||||
| # Date:     2017-09  - 2020-09 | # Date:     2017-09  - 2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.3    2020 Maintenance. Add sample solution. | #           1.3    2020 Maintenance. Add sample solution. | ||||||
| #           1.2    Update set.seed() usage | #           1.2    Update set.seed() usage | ||||||
| #           1.1    Corrected treatment of empirical p-value | #           1.1    Corrected treatment of empirical p-value | ||||||
| #           1.0    First contents | #           1.0    First contents | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                              Line | #TOC>   Section  Title                                              Line | ||||||
| #TOC> ------------------------------------------------------------------ | #TOC> ------------------------------------------------------------------ | ||||||
| #TOC>   1        Significance and p-value                             49 | #TOC>   1        Significance and p-value                             49 | ||||||
| #TOC>   1.1        Significance levels                                60 | #TOC>   1.1        Significance levels                                60 | ||||||
| #TOC>   1.2        probability and p-value                            77 | #TOC>   1.2        probability and p-value                            77 | ||||||
| #TOC>   1.2.1          p-value illustrated                           109 | #TOC>   1.2.1          p-value illustrated                           109 | ||||||
| #TOC>   2        One- or two-sided                                   165 | #TOC>   2        One- or two-sided                                   165 | ||||||
| #TOC>   3        Significance by integration                         209 | #TOC>   3        Significance by integration                         209 | ||||||
| #TOC>   4        Significance by simulation or permutation           215 | #TOC>   4        Significance by simulation or permutation           215 | ||||||
| #TOC>   5        Final tasks                                         327 | #TOC>   5        Final tasks                                         327 | ||||||
| #TOC>   6        Sample solutions                                    336 | #TOC>   6        Sample solutions                                    336 | ||||||
| #TOC>   6.1                                                          338 | #TOC>   6.1                                                          338 | ||||||
| #TOC>   6.2                                                          342 | #TOC>   6.2                                                          342 | ||||||
| #TOC>   6.3                                                          346 | #TOC>   6.3                                                          346 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Significance and p-value  ============================================ | # =    1  Significance and p-value  ============================================ | ||||||
|  |  | ||||||
| # The idea of the probability of an event has a precise mathematical | # The idea of the probability of an event has a precise mathematical | ||||||
| # interpretation, but how is it useful to know the probability? Usually we are | # interpretation, but how is it useful to know the probability? Usually we are | ||||||
| # interested in whether we should accept or reject a hypothesis based on the | # interested in whether we should accept or reject a hypothesis based on the | ||||||
| # observations we have. A rational way to do this is to say: if the probability | # observations we have. A rational way to do this is to say: if the probability | ||||||
| # of observing the data is very small under the null-hypothesis, then we will | # of observing the data is very small under the null-hypothesis, then we will | ||||||
| # assume the observation is due to something other than the null-hypothesis. But | # assume the observation is due to something other than the null-hypothesis. But | ||||||
| # what do we mean by the "probability of our observation"? And what is "very | # what do we mean by the "probability of our observation"? And what is "very | ||||||
| # small"? | # small"? | ||||||
|  |  | ||||||
| # ==   1.1  Significance levels  =============================================== | # ==   1.1  Significance levels  =============================================== | ||||||
|  |  | ||||||
| # A "very small" probability is purely a matter of convention - a cultural | # A "very small" probability is purely a matter of convention - a cultural | ||||||
| # convention. In the biomedical field we usually call probabilities of less than | # convention. In the biomedical field we usually call probabilities of less than | ||||||
| # 0.05 (5%) small enough to reject the null-hypothesis. Thus we call | # 0.05 (5%) small enough to reject the null-hypothesis. Thus we call | ||||||
| # observations with a probability of less than 0.05 "significant" and if we want | # observations with a probability of less than 0.05 "significant" and if we want | ||||||
| # to highlight this in text or in a graph, we often mark them with an asterisk | # to highlight this in text or in a graph, we often mark them with an asterisk | ||||||
| # (*). Also we often call observations with a probability of less than 0.01 | # (*). Also we often call observations with a probability of less than 0.01 | ||||||
| # "highly significant" and mark them with two asterisks (**). But there is no | # "highly significant" and mark them with two asterisks (**). But there is no | ||||||
| # special significance in these numbers, the cutoff point for significance could | # special significance in these numbers, the cutoff point for significance could | ||||||
| # also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the | # also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the | ||||||
| # British statistician Ronald Fisher happened to propose for this purpose in | # British statistician Ronald Fisher happened to propose for this purpose in | ||||||
| # 1925. Incidentally, Fisher later recommended to use different cutoffs for | # 1925. Incidentally, Fisher later recommended to use different cutoffs for | ||||||
| # different purposes (cf. | # different purposes (cf. | ||||||
| # https://en.wikipedia.org/wiki/Statistical_significance). | # https://en.wikipedia.org/wiki/Statistical_significance). | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.2  probability and p-value  =========================================== | # ==   1.2  probability and p-value  =========================================== | ||||||
|  |  | ||||||
| # But what do we even mean by the probability of an observation? | # But what do we even mean by the probability of an observation? | ||||||
| # Assume I am drawing samples from a normal distribution with a mean of 0 and a | # Assume I am drawing samples from a normal distribution with a mean of 0 and a | ||||||
| # standard deviation of 1. The sample I get is ... | # standard deviation of 1. The sample I get is ... | ||||||
|  |  | ||||||
| set.seed(sqrt(5)) | set.seed(sqrt(5)) | ||||||
| x <- rnorm(1) | x <- rnorm(1) | ||||||
| set.seed(NULL) | set.seed(NULL) | ||||||
|  |  | ||||||
| print(x, digits = 22) | print(x, digits = 22) | ||||||
| # [1] -0.8969145466249813791748 | # [1] -0.8969145466249813791748 | ||||||
|  |  | ||||||
| # So what's the probability of that number? Obviously, the probability of | # So what's the probability of that number? Obviously, the probability of | ||||||
| # getting exactly this number is very, very, very small. But also obviously, | # getting exactly this number is very, very, very small. But also obviously, | ||||||
| # this does not mean that observing this number is in any way significant - we | # this does not mean that observing this number is in any way significant - we | ||||||
| # always observe some number. That's not what we mean in this case. There are | # always observe some number. That's not what we mean in this case. There are | ||||||
| # several implicit assumptions when we speak of the probability of an | # several implicit assumptions when we speak of the probability of an | ||||||
| # observation: | # observation: | ||||||
|  |  | ||||||
| # 1: the observation can be compared to a probability distribution; | # 1: the observation can be compared to a probability distribution; | ||||||
| # 2: that distribution can be integrated between any specific value | # 2: that distribution can be integrated between any specific value | ||||||
| #      and its upper and lower bounds (or +- infinity). | #      and its upper and lower bounds (or +- infinity). | ||||||
|  |  | ||||||
| # Then what we really mean by the probability of an observation in the context | # Then what we really mean by the probability of an observation in the context | ||||||
| # of that distribution is: the probability of observing that value, or a value | # of that distribution is: the probability of observing that value, or a value | ||||||
| # more extreme than the one we have. We call this the p-value. Note that we are | # more extreme than the one we have. We call this the p-value. Note that we are | ||||||
| # not talking about an individual number anymore, we are talking about the area | # not talking about an individual number anymore, we are talking about the area | ||||||
| # under the curve between our observation and the upper (or lower) bound of the | # under the curve between our observation and the upper (or lower) bound of the | ||||||
| # curve, as a fraction of the whole. | # curve, as a fraction of the whole. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ===   1.2.1  p-value illustrated                       | # ===   1.2.1  p-value illustrated                       | ||||||
|  |  | ||||||
| # Let's illustrate. First we draw a million random values from our | # Let's illustrate. First we draw a million random values from our | ||||||
| # standard, normal distribution: | # standard, normal distribution: | ||||||
|  |  | ||||||
| N <- 1e6                             # one million | N <- 1e6                             # one million | ||||||
| set.seed(112358)                     # set RNG seed for repeatable randomness | set.seed(112358)                     # set RNG seed for repeatable randomness | ||||||
| r <- rnorm(N)                        # N values from a normal distribution | r <- rnorm(N)                        # N values from a normal distribution | ||||||
| set.seed(NULL)                       # reset the RNG | set.seed(NULL)                       # reset the RNG | ||||||
|  |  | ||||||
| # Let's see what the distribution looks like: | # Let's see what the distribution looks like: | ||||||
|  |  | ||||||
| (h <- hist(r)) | (h <- hist(r)) | ||||||
|  |  | ||||||
| # The histogram details are now available in the list h -  e.g. h$counts | # The histogram details are now available in the list h -  e.g. h$counts | ||||||
|  |  | ||||||
| # Where is the value we have drawn previously? | # Where is the value we have drawn previously? | ||||||
| abline(v = x, col = "#EE0000") | abline(v = x, col = "#EE0000") | ||||||
|  |  | ||||||
| # How many values are smaller? | # How many values are smaller? | ||||||
| sum(r < x) | sum(r < x) | ||||||
|  |  | ||||||
| # Let's color the bars: | # Let's color the bars: | ||||||
| #    first, make a vector of red and green colors for the bars with breaks | #    first, make a vector of red and green colors for the bars with breaks | ||||||
| #    smaller and larger than x, white for the bar that contains x ... | #    smaller and larger than x, white for the bar that contains x ... | ||||||
| hCol <- rep("#EE000044", sum(h$breaks < x) - 1) | hCol <- rep("#EE000044", sum(h$breaks < x) - 1) | ||||||
| hCol <- c(hCol, "#FFFFFFFF") | hCol <- c(hCol, "#FFFFFFFF") | ||||||
| hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1)) | hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1)) | ||||||
| # ... then plot the histogram, with colored bars ... | # ... then plot the histogram, with colored bars ... | ||||||
| hist(r, col = hCol) | hist(r, col = hCol) | ||||||
| # ... add two colored rectangles into the white bar ... | # ... add two colored rectangles into the white bar ... | ||||||
| idx <- sum(h$breaks < x) | idx <- sum(h$breaks < x) | ||||||
| xMin <- h$breaks[idx] | xMin <- h$breaks[idx] | ||||||
| xMax <- h$breaks[idx + 1] | xMax <- h$breaks[idx + 1] | ||||||
| y <- h$counts[idx] | y <- h$counts[idx] | ||||||
| rect(xMin, 0, x, y, col = "#EE000044", border = TRUE) | rect(xMin, 0, x, y, col = "#EE000044", border = TRUE) | ||||||
| rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE) | rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE) | ||||||
| # ... and a red line for our observation. | # ... and a red line for our observation. | ||||||
| abline(v = x, col = "#EE0000", lwd = 2) | abline(v = x, col = "#EE0000", lwd = 2) | ||||||
|  |  | ||||||
| # The p-value of our observation is the red area as a fraction of the | # The p-value of our observation is the red area as a fraction of the | ||||||
| # whole histogram (red + green). | # whole histogram (red + green). | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| #    Explain how the expression sum(r < x) works to give us a count of values | #    Explain how the expression sum(r < x) works to give us a count of values | ||||||
| #    with the property we are looking for. E.g., examine -4:4 < x | #    with the property we are looking for. E.g., examine -4:4 < x | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| #    Write an expression to estimate the probability that a value | #    Write an expression to estimate the probability that a value | ||||||
| #    drawn from the vector r is less-or-equal to x. The result you get | #    drawn from the vector r is less-or-equal to x. The result you get | ||||||
| #    will depend on the exact values that went into the vector r but it should | #    will depend on the exact values that went into the vector r but it should | ||||||
| #    be close to 0.185  That expression is the p-value associated with x. | #    be close to 0.185  That expression is the p-value associated with x. | ||||||
| #    (Sample solution 6.1) | #    (Sample solution 6.1) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  One- or two-sided  =================================================== | # =    2  One- or two-sided  =================================================== | ||||||
|  |  | ||||||
| # The shape of our histogram confirms that the rnorm() function has returned | # The shape of our histogram confirms that the rnorm() function has returned | ||||||
| # values that appear distributed according to a normal distribution. In a normal | # values that appear distributed according to a normal distribution. In a normal | ||||||
| # distribution, readily available tables tell us that 5% of the values (i.e. our | # distribution, readily available tables tell us that 5% of the values (i.e. our | ||||||
| # significance level) lie 1.96 (or approximately 2) standard deviations away | # significance level) lie 1.96 (or approximately 2) standard deviations away | ||||||
| # from the mean. Is this the case here? How many values in our vector r are | # from the mean. Is this the case here? How many values in our vector r are | ||||||
| # larger than 1.96? | # larger than 1.96? | ||||||
|  |  | ||||||
| sum(r > 1.96) | sum(r > 1.96) | ||||||
| # [1] 24589 | # [1] 24589 | ||||||
|  |  | ||||||
| # Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why? | # Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why? | ||||||
|  |  | ||||||
| # The answer is: we have to be careful with two-sided distributions. 2 standard | # The answer is: we have to be careful with two-sided distributions. 2 standard | ||||||
| # deviations away from the mean means either larger than 1.96 or smaller than | # deviations away from the mean means either larger than 1.96 or smaller than | ||||||
| # -1.96. This can give rise to errors. If we are simply interested in outliers, | # -1.96. This can give rise to errors. If we are simply interested in outliers, | ||||||
| # no matter whether larger or smaller, then the 1.96 SD cutoff for significance | # no matter whether larger or smaller, then the 1.96 SD cutoff for significance | ||||||
| # is correct. But if we are specifically interested in, say, larger values, | # is correct. But if we are specifically interested in, say, larger values, | ||||||
| # because a smaller value is not meaningful, then the significance cutoff, | # because a smaller value is not meaningful, then the significance cutoff, | ||||||
| # expressed as standard deviations, is relaxed. We can use the quantile | # expressed as standard deviations, is relaxed. We can use the quantile | ||||||
| # function to see what the cutoff values are: | # function to see what the cutoff values are: | ||||||
|  |  | ||||||
| quantile(r) | quantile(r) | ||||||
| quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries | quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries | ||||||
| # close to ± 1.96, as expected | # close to ± 1.96, as expected | ||||||
| quantile(r, probs = 0.95) # for the single 5% boundary | quantile(r, probs = 0.95) # for the single 5% boundary | ||||||
| # close to 1.64 . Check counts to confirm: | # close to 1.64 . Check counts to confirm: | ||||||
| sum(r > quantile(r, probs = 0.95)) | sum(r > quantile(r, probs = 0.95)) | ||||||
| # [1] 50000 | # [1] 50000 | ||||||
| # which is 5%, as expected. | # which is 5%, as expected. | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| # Use abline() to add the p = 0.05 boundary for smaller values to the histogram. | # Use abline() to add the p = 0.05 boundary for smaller values to the histogram. | ||||||
| # (Sample solution 6.2) | # (Sample solution 6.2) | ||||||
|  |  | ||||||
| # To summarize: when we evaluate the significance of an event, we divide a | # To summarize: when we evaluate the significance of an event, we divide a | ||||||
| # probability distribution into two parts at the point where the event was | # probability distribution into two parts at the point where the event was | ||||||
| # observed. We then ask whether the integral over the more extreme part is less | # observed. We then ask whether the integral over the more extreme part is less | ||||||
| # or more than 5% of the whole. If it is less, we deem the event to be | # or more than 5% of the whole. If it is less, we deem the event to be | ||||||
| # significant. | # significant. | ||||||
| # | # | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Significance by integration  ========================================= | # =    3  Significance by integration  ========================================= | ||||||
|  |  | ||||||
| # If the underlying probability distribution can be analytically or numerically | # If the underlying probability distribution can be analytically or numerically | ||||||
| # integrated, the significance of an observation can be directly computed. | # integrated, the significance of an observation can be directly computed. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Significance by simulation or permutation  =========================== | # =    4  Significance by simulation or permutation  =========================== | ||||||
|  |  | ||||||
| # But whether the integration is correct, or relies on assumptions that may not | # But whether the integration is correct, or relies on assumptions that may not | ||||||
| # be warranted for biological data, can be a highly technical question. | # be warranted for biological data, can be a highly technical question. | ||||||
| # Fortunately, we can often simply run a simulation, a random resampling, or a | # Fortunately, we can often simply run a simulation, a random resampling, or a | ||||||
| # permutation and then count the number of outcomes, just as we did with our | # permutation and then count the number of outcomes, just as we did with our | ||||||
| # rnorm() samples. We call this an empirical p-value. (Actually, the "empirical | # rnorm() samples. We call this an empirical p-value. (Actually, the "empirical | ||||||
| # p-value" is defined as (Nobs + 1) / (N + 1).  ) | # p-value" is defined as (Nobs + 1) / (N + 1).  ) | ||||||
|  |  | ||||||
| # Here is an example. Assume you have a protein sequence and | # Here is an example. Assume you have a protein sequence and | ||||||
| # you speculate that positively charged residues are close to negatively charged | # you speculate that positively charged residues are close to negatively charged | ||||||
| # residues to balance charge locally. A statistic that would capture this is the | # residues to balance charge locally. A statistic that would capture this is the | ||||||
| # mean minimum distance between all D,E residues and the closest R,K,H | # mean minimum distance between all D,E residues and the closest R,K,H | ||||||
| # residue. Let's compute this for the sequence of yeast Mbp1. | # residue. Let's compute this for the sequence of yeast Mbp1. | ||||||
|  |  | ||||||
| MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK", | MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK", | ||||||
|                "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA", |                "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA", | ||||||
|                "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR", |                "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR", | ||||||
|                "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ", |                "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ", | ||||||
|                "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS", |                "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS", | ||||||
|                "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY", |                "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY", | ||||||
|                "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", |                "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", | ||||||
|                "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP", |                "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP", | ||||||
|                "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT", |                "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT", | ||||||
|                "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP", |                "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP", | ||||||
|                "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK", |                "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK", | ||||||
|                "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR", |                "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR", | ||||||
|                "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK", |                "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK", | ||||||
|                "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA") |                "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA") | ||||||
|  |  | ||||||
| # first we split this string into individual characters: | # first we split this string into individual characters: | ||||||
| v <- unlist(strsplit(MBP1, "")) | v <- unlist(strsplit(MBP1, "")) | ||||||
|  |  | ||||||
| # and find the positions of our charged residues | # and find the positions of our charged residues | ||||||
|  |  | ||||||
| ED  <- grep("[ED]", v) | ED  <- grep("[ED]", v) | ||||||
| RKH <- grep("[RKH]", v) | RKH <- grep("[RKH]", v) | ||||||
|  |  | ||||||
| sep <- numeric(length(ED)) # this vector will hold the distances | sep <- numeric(length(ED)) # this vector will hold the distances | ||||||
| for (i in seq_along(ED)) { | for (i in seq_along(ED)) { | ||||||
|   sep[i] <- min(abs(RKH - ED[i])) |   sep[i] <- min(abs(RKH - ED[i])) | ||||||
| } | } | ||||||
|  |  | ||||||
| # Task: read and explain this bit of code | # Task: read and explain this bit of code | ||||||
|  |  | ||||||
| # Now that sep is computed, what does it look like? | # Now that sep is computed, what does it look like? | ||||||
|  |  | ||||||
| table(sep)  # these are the minimum distances | table(sep)  # these are the minimum distances | ||||||
| # 24 of D,E residues are adjacent to R,K,H; | # 24 of D,E residues are adjacent to R,K,H; | ||||||
| # the longest separation is 28 residues. | # the longest separation is 28 residues. | ||||||
|  |  | ||||||
| # What is the mean separation? | # What is the mean separation? | ||||||
| mean(sep) | mean(sep) | ||||||
|  |  | ||||||
| # The value is 4.1 . Is this significant? Honestly, I would be hard pressed | # The value is 4.1 . Is this significant? Honestly, I would be hard pressed | ||||||
| # to solve this analytically. But by permutation it's soooo easy. | # to solve this analytically. But by permutation it's soooo easy. | ||||||
|  |  | ||||||
| # First, we combine what we have done above into a function: | # First, we combine what we have done above into a function: | ||||||
|  |  | ||||||
chSep <- function(v) {
  # Computes the mean minimum separation of oppositely charged residues.
  # Parameter:
  #    v  (char)  a vector of amino acids in the one-letter code, one residue
  #               per element; upper- and lowercase letters are accepted.
  # Value:
  #    msep (numeric) mean, over all D/E residues, of the distance (in
  #               sequence positions) to the closest R/K/H residue.
  #               NA_real_ if v contains no D/E or no R/K/H residue, because
  #               the statistic is undefined in that case.

  ED  <- grep("[EDed]", v)      # positions of negatively charged residues
  RKH <- grep("[RKHrkh]", v)    # positions of positively charged residues

  # Without at least one residue of each kind, min() would warn and return
  # Inf (no R/K/H), or mean() would return NaN (no D/E). Return NA instead.
  if (length(ED) == 0 || length(RKH) == 0) {
    return(NA_real_)
  }

  # For each acidic residue: distance to the closest basic residue.
  sep <- vapply(ED,
                function(pos) { min(abs(RKH - pos)) },
                numeric(1))
  return(mean(sep))
}
|  |  | ||||||
| # Execute the function to define it. | # Execute the function to define it. | ||||||
|  |  | ||||||
| # Confirm that the function gives the same result as the number we | # Confirm that the function gives the same result as the number we | ||||||
| # calculated above: | # calculated above: | ||||||
| chSep(v) | chSep(v) | ||||||
|  |  | ||||||
| # Now we can produce a random permutation of v, and recalculate | # Now we can produce a random permutation of v, and recalculate | ||||||
|  |  | ||||||
| set.seed(pi)                       # set RNG seed for repeatable randomness | set.seed(pi)                       # set RNG seed for repeatable randomness | ||||||
| w <- sample(v, length(v))          # This shuffles the vector v. Memorize this | w <- sample(v, length(v))          # This shuffles the vector v. Memorize this | ||||||
|                                    # code paradigm. It is very useful. |                                    # code paradigm. It is very useful. | ||||||
| set.seed(NULL)                     # reset the RNG | set.seed(NULL)                     # reset the RNG | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| chSep(w) | chSep(w) | ||||||
| # 3.773 ... that's actually less than what we had before. | # 3.773 ... that's actually less than what we had before. | ||||||
|  |  | ||||||
| # Let's do this 10000 times and record the results (takes a few seconds): | # Let's do this 10000 times and record the results (takes a few seconds): | ||||||
|  |  | ||||||
# Build the null distribution: shuffle the sequence N times and record the
# mean minimum charge separation of each shuffled sequence.
N <- 10000
chs <- numeric(N)          # preallocate the result vector
for (i in seq_len(N)) {    # seq_len() yields no iterations if N is set to 0
  chs[i] <- chSep(sample(v, length(v))) # charge
}
|  |  | ||||||
| hist(chs, breaks = 50) | hist(chs, breaks = 50) | ||||||
| abline(v = chSep(v), col = "#EE0000") | abline(v = chSep(v), col = "#EE0000") | ||||||
|  |  | ||||||
| # Contrary to our expectations, the actual observed mean minimum charge | # Contrary to our expectations, the actual observed mean minimum charge | ||||||
| # separation seems to be larger than what we observe in randomly permuted | # separation seems to be larger than what we observe in randomly permuted | ||||||
| # sequences. But is this significant? It is your task to find out. | # sequences. But is this significant? It is your task to find out. | ||||||
|  |  | ||||||
| # Task: | # Task: | ||||||
| # Calculate the empirical p-value for chSep(v) | # Calculate the empirical p-value for chSep(v) | ||||||
| # (Sample solution 6.3) | # (Sample solution 6.3) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    5  Final tasks  ========================================================= | # =    5  Final tasks  ========================================================= | ||||||
|  |  | ||||||
| # From chs, compute the empirical p-value of a mean minimum charge separation to | # From chs, compute the empirical p-value of a mean minimum charge separation to | ||||||
| #   be larger or equal to the value observed for the yeast MBP1 sequence. Note | #   be larger or equal to the value observed for the yeast MBP1 sequence. Note | ||||||
| #   the result in your journal. Is it significant? Also note the result of | #   the result in your journal. Is it significant? Also note the result of | ||||||
| #   the following expression for validation: | #   the following expression for validation: | ||||||
| seal(sum(chs)) | seal(sum(chs)) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    6  Sample solutions  ==================================================== | # =    6  Sample solutions  ==================================================== | ||||||
|  |  | ||||||
| # ==   6.1    ================================================================== | # ==   6.1    ================================================================== | ||||||
| # | # | ||||||
| sum(r <= x) / length(r) | sum(r <= x) / length(r) | ||||||
|  |  | ||||||
| # ==   6.2    ================================================================== | # ==   6.2    ================================================================== | ||||||
| # | # | ||||||
| abline(v = quantile(r, probs = c(0.05))) | abline(v = quantile(r, probs = c(0.05))) | ||||||
|  |  | ||||||
| # ==   6.3    ================================================================== | # ==   6.3    ================================================================== | ||||||
| # | # | ||||||
| ( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) ) | ( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) ) | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,3 +1,3 @@ | |||||||
| # BCH441-WORK-ABC-units | # BCH441-WORK-ABC-units | ||||||
|  |  | ||||||
| This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date. | This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date. | ||||||
							
								
								
									
										490
									
								
								RPR-Biostrings.R
									
									
									
									
									
								
							
							
						
						
									
										490
									
								
								RPR-Biostrings.R
									
									
									
									
									
								
							| @@ -1,245 +1,245 @@ | |||||||
| # tocID <- "RPR-Biostrings.R" | # tocID <- "RPR-Biostrings.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Biostrings unit. | #              R code accompanying the RPR-Biostrings unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Updates | #           1.2    2020 Updates | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0    2017 Revisions | #           1.0    2017 Revisions | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                             Line | #TOC>   Section  Title                                             Line | ||||||
| #TOC> ----------------------------------------------------------------- | #TOC> ----------------------------------------------------------------- | ||||||
| #TOC>   1        The Biostrings:: Package                            56 | #TOC>   1        The Biostrings:: Package                            56 | ||||||
| #TOC>   2        Getting Data into Biostrings:: Objects              88 | #TOC>   2        Getting Data into Biostrings:: Objects              88 | ||||||
| #TOC>   3        Working with Biostrings:: Objects                  110 | #TOC>   3        Working with Biostrings:: Objects                  110 | ||||||
| #TOC>   3.1        Properties                                       127 | #TOC>   3.1        Properties                                       127 | ||||||
| #TOC>   3.2        Subsetting                                       168 | #TOC>   3.2        Subsetting                                       168 | ||||||
| #TOC>   3.3        Operators                                        180 | #TOC>   3.3        Operators                                        180 | ||||||
| #TOC>   3.4        Transformations                                  187 | #TOC>   3.4        Transformations                                  187 | ||||||
| #TOC>   4        Getting Data out of Biostrings:: Objects           194 | #TOC>   4        Getting Data out of Biostrings:: Objects           194 | ||||||
| #TOC>   5        More                                               203 | #TOC>   5        More                                               203 | ||||||
| #TOC>   5.1        Views                                            205 | #TOC>   5.1        Views                                            205 | ||||||
| #TOC>   5.2        Iranges                                          219 | #TOC>   5.2        Iranges                                          219 | ||||||
| #TOC>   5.3        StringSets                                       225 | #TOC>   5.3        StringSets                                       225 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # This is a very brief introduction to the Biostrings:: package, other units will | # This is a very brief introduction to the Biostrings:: package, other units will | ||||||
| # be using more of the Biostrings:: functions. | # be using more of the Biostrings:: functions. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  The Biostrings:: Package  ============================================ | # =    1  The Biostrings:: Package  ============================================ | ||||||
|  |  | ||||||
|  |  | ||||||
| # First, we install and load the Biostrings:: package from bioconductor (if we | # First, we install and load the Biostrings:: package from bioconductor (if we | ||||||
| # haven't done so already). | # haven't done so already). | ||||||
|  |  | ||||||
| if (! requireNamespace("BiocManager", quietly = TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   install.packages("BiocManager") |   install.packages("BiocManager") | ||||||
| } | } | ||||||
| if (! requireNamespace("Biostrings", quietly = TRUE)) { | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   BiocManager::install("Biostrings") |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Examine the package information: | # Examine the package information: | ||||||
| library(help = Biostrings)       # basic information | library(help = Biostrings)       # basic information | ||||||
| browseVignettes("Biostrings")    # available vignettes | browseVignettes("Biostrings")    # available vignettes | ||||||
| data(package = "Biostrings")     # available datasets | data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # At its core, Biostrings:: objects are "classes" of type XString (you can think | # At its core, Biostrings:: objects are "classes" of type XString (you can think | ||||||
| # of a "class" in R as a special kind of list), that can take on particular | # of a "class" in R as a special kind of list), that can take on particular | ||||||
| # flavours for RNA, DNA or amino acid sequence information. | # flavours for RNA, DNA or amino acid sequence information. | ||||||
|  |  | ||||||
| class(Biostrings::RNAString("AUG")) | class(Biostrings::RNAString("AUG")) | ||||||
| class(Biostrings::DNAString("ATG")) | class(Biostrings::DNAString("ATG")) | ||||||
| class(Biostrings::AAString("M")) | class(Biostrings::AAString("M")) | ||||||
|  |  | ||||||
| # An essential property of Biostrings:: objects is that they only allow letters | # An essential property of Biostrings:: objects is that they only allow letters | ||||||
| # from the applicable IUPAC alphabet: | # from the applicable IUPAC alphabet: | ||||||
| Biostrings::RNAString("AUG") | Biostrings::RNAString("AUG") | ||||||
| Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes | Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Getting Data into Biostrings:: Objects  ============================== | # =    2  Getting Data into Biostrings:: Objects  ============================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Example: read FASTA. Extract sequence. Convert to DNAString object. | # Example: read FASTA. Extract sequence. Convert to DNAString object. | ||||||
| rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||||
| rawSeq <- dbSanitizeSequence(rawSeq) | rawSeq <- dbSanitizeSequence(rawSeq) | ||||||
| biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence | biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence | ||||||
|                                             # into an object of class DNAstring |                                             # into an object of class DNAstring | ||||||
|  |  | ||||||
| # Multi FASTA files can be read directly as an "XStringSet" ... | # Multi FASTA files can be read directly as an "XStringSet" ... | ||||||
| rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa" | rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa" | ||||||
| (biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile)) | (biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile)) | ||||||
|  |  | ||||||
| # ... and if you subset one sequence from the set, you get an XString object | # ... and if you subset one sequence from the set, you get an XString object | ||||||
| # back again. | # back again. | ||||||
| (Xseq <- biosDNASet[[1]]) | (Xseq <- biosDNASet[[1]]) | ||||||
|  |  | ||||||
| biosDNAseq == Xseq           # the comparison evaluates to TRUE ... | biosDNAseq == Xseq           # the comparison evaluates to TRUE ... | ||||||
| identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical. | identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Working with Biostrings:: Objects  =================================== | # =    3  Working with Biostrings:: Objects  =================================== | ||||||
|  |  | ||||||
| # Biostrings:: is a highly engineered package that is tightly integrated into | # Biostrings:: is a highly engineered package that is tightly integrated into | ||||||
| # the Bioconductor world - unfortunately that brings with it a somewhat | # the Bioconductor world - unfortunately that brings with it a somewhat | ||||||
| # undesirable level of computational overhead and dependencies. Using the | # undesirable level of computational overhead and dependencies. Using the | ||||||
| # package as we normally do - i.e. calling required functions with their | # package as we normally do - i.e. calling required functions with their | ||||||
| # explicit package prefix is therefore not advisable. There are generics | # explicit package prefix is therefore not advisable. There are generics | ||||||
| # that won't be properly dispatched. If you only need a small number of | # that won't be properly dispatched. If you only need a small number of | ||||||
| # functions for a very specific context, you will probably get away with | # functions for a very specific context, you will probably get away with | ||||||
| # Biostrings::<function>() - but even in the demonstration code of this script | # Biostrings::<function>() - but even in the demonstration code of this script | ||||||
| # not everything works out of the box. We'll therefore load the library, | # not everything works out of the box. We'll therefore load the library, | ||||||
| # but we'll (redundantly) use the prefix anyway so as to emphasize where | # but we'll (redundantly) use the prefix anyway so as to emphasize where | ||||||
| # the functions come from. | # the functions come from. | ||||||
|  |  | ||||||
| library(Biostrings) | library(Biostrings) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.1  Properties  ======================================================== | # ==   3.1  Properties  ======================================================== | ||||||
| str(rawSeq) | str(rawSeq) | ||||||
| str(biosDNAseq) | str(biosDNAseq) | ||||||
|  |  | ||||||
| length(rawSeq)       # ... is 1: one string only. To get the number of | length(rawSeq)       # ... is 1: one string only. To get the number of | ||||||
|                      # characters in a string, you need nchar(). |                      # characters in a string, you need nchar(). | ||||||
| length(biosDNAseq)   # but the length of a "Bstring" is the number of elements | length(biosDNAseq)   # but the length of a "Bstring" is the number of elements | ||||||
| nchar(rawSeq) | nchar(rawSeq) | ||||||
| nchar(biosDNAseq)    # ... but nchar() works too. | nchar(biosDNAseq)    # ... but nchar() works too. | ||||||
|  |  | ||||||
| (uL <- Biostrings::uniqueLetters(biosDNAseq)) | (uL <- Biostrings::uniqueLetters(biosDNAseq)) | ||||||
|  |  | ||||||
| # Count frequencies - with strings, you would strsplit() into a character | # Count frequencies - with strings, you would strsplit() into a character | ||||||
| # vector and then use table(). Biostrings:: has alphabetFrequency(): | # vector and then use table(). Biostrings:: has alphabetFrequency(): | ||||||
| Biostrings::alphabetFrequency(biosDNAseq) | Biostrings::alphabetFrequency(biosDNAseq) | ||||||
|  |  | ||||||
| # letterFrequency() works with a defined alphabet - such as what uniqueLetters() | # letterFrequency() works with a defined alphabet - such as what uniqueLetters() | ||||||
| # returns. | # returns. | ||||||
| Biostrings::letterFrequency(biosDNAseq, uL) | Biostrings::letterFrequency(biosDNAseq, uL) | ||||||
| sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) / | sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) / | ||||||
|   length(biosDNAseq) # GC contents |   length(biosDNAseq) # GC contents | ||||||
|  |  | ||||||
| Biostrings::dinucleotideFrequency(biosDNAseq) | Biostrings::dinucleotideFrequency(biosDNAseq) | ||||||
| barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5) | barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5) | ||||||
|  |  | ||||||
| (triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq)) | (triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq)) | ||||||
| barplot(sort(triNuc), col="#4499EE33") | barplot(sort(triNuc), col="#4499EE33") | ||||||
| triNuc[triNuc == max(triNuc)] | triNuc[triNuc == max(triNuc)] | ||||||
| triNuc[triNuc == min(triNuc)] | triNuc[triNuc == min(triNuc)] | ||||||
| max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT | max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT | ||||||
|  |  | ||||||
| # compare to a shuffled sequence: | # compare to a shuffled sequence: | ||||||
| (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | ||||||
| barplot(sort(triNuc), col="#EEEE4433", add = TRUE) | barplot(sort(triNuc), col="#EEEE4433", add = TRUE) | ||||||
| max(triNuc) | max(triNuc) | ||||||
| # Interpret this plot. | # Interpret this plot. | ||||||
| (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | ||||||
| barplot(sort(triNuc), col="#EEEE4433") | barplot(sort(triNuc), col="#EEEE4433") | ||||||
| max(triNuc) | max(triNuc) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.2  Subsetting  ======================================================== | # ==   3.2  Subsetting  ======================================================== | ||||||
|  |  | ||||||
| # Subsetting any XString object works as expected: | # Subsetting any XString object works as expected: | ||||||
| biosDNAseq[4:15] | biosDNAseq[4:15] | ||||||
|  |  | ||||||
| # ... well - maybe not expected, because rawSeq[4:15] would not work. | # ... well - maybe not expected, because rawSeq[4:15] would not work. | ||||||
|  |  | ||||||
| # Alternatively to the "[" operator, use the subseq() function - especially for | # Alternatively to the "[" operator, use the subseq() function - especially for | ||||||
| # long sequences. This is far more efficient. | # long sequences. This is far more efficient. | ||||||
| Biostrings::subseq(biosDNAseq, start = 1, end = 30) | Biostrings::subseq(biosDNAseq, start = 1, end = 30) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.3  Operators  ========================================================= | # ==   3.3  Operators  ========================================================= | ||||||
|  |  | ||||||
| # RNAString and DNAString objects compare U and T as equals! | # RNAString and DNAString objects compare U and T as equals! | ||||||
|   Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") == |   Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") == | ||||||
|   Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT") |   Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT") | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.4  Transformations  =================================================== | # ==   3.4  Transformations  =================================================== | ||||||
|  |  | ||||||
| biosDNAseq[4:15] | biosDNAseq[4:15] | ||||||
| Biostrings::reverseComplement(biosDNAseq[4:15]) | Biostrings::reverseComplement(biosDNAseq[4:15]) | ||||||
| Biostrings::translate(biosDNAseq[4:15]) | Biostrings::translate(biosDNAseq[4:15]) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Getting Data out of Biostrings:: Objects  ============================ | # =    4  Getting Data out of Biostrings:: Objects  ============================ | ||||||
|  |  | ||||||
| # If you need a character object, use toString(): | # If you need a character object, use toString(): | ||||||
|  |  | ||||||
| Biostrings::toString(biosDNAseq[4:15]) | Biostrings::toString(biosDNAseq[4:15]) | ||||||
|  |  | ||||||
| # saveRDS() and readRDS() work as they do for all other R objects. | # saveRDS() and readRDS() work as they do for all other R objects. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    5  More  ================================================================ | # =    5  More  ================================================================ | ||||||
|  |  | ||||||
| # ==   5.1  Views  ============================================================= | # ==   5.1  Views  ============================================================= | ||||||
|  |  | ||||||
| # Biostring "Views" are objects that store multiple substrings of one | # Biostring "Views" are objects that store multiple substrings of one | ||||||
| # Biostring object. | # Biostring object. | ||||||
|  |  | ||||||
| (myView <- Biostrings::Views(biosDNAseq, | (myView <- Biostrings::Views(biosDNAseq, | ||||||
|                              start = c(1, 19, 37), |                              start = c(1, 19, 37), | ||||||
|                              end = c(15, 30, 45))) |                              end = c(15, 30, 45))) | ||||||
|  |  | ||||||
| # Views are convenient to store feature annotations | # Views are convenient to store feature annotations | ||||||
| names(myView) <- c("Feature-A", "Feature-B", "Feature-C") | names(myView) <- c("Feature-A", "Feature-B", "Feature-C") | ||||||
| cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView )) | cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView )) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   5.2  Iranges  =========================================================== | # ==   5.2  Iranges  =========================================================== | ||||||
|  |  | ||||||
| # Biostrings:: IRanges are like Views with a common start point. These can be | # Biostrings:: IRanges are like Views with a common start point. These can be | ||||||
| # useful for feature annotations. Instead of start/end you store start/width. | # useful for feature annotations. Instead of start/end you store start/width. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   5.3  StringSets  ======================================================== | # ==   5.3  StringSets  ======================================================== | ||||||
|  |  | ||||||
| # Biostring "StringSets" store multiple sequences. | # Biostring "StringSets" store multiple sequences. | ||||||
| # | # | ||||||
| ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA") | ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA") | ||||||
| sample(ompA) # sample can work directly on a Biostring object to shuffle it | sample(ompA) # sample can work directly on a Biostring object to shuffle it | ||||||
|  |  | ||||||
| x <- Biostrings::toString(ompA) | x <- Biostrings::toString(ompA) | ||||||
| for (i in 2:10) { | for (i in 2:10) { | ||||||
|   x[i] <- Biostrings::toString(sample(ompA)) |   x[i] <- Biostrings::toString(sample(ompA)) | ||||||
| } | } | ||||||
| shuffledPeptideSet <- Biostrings::AAStringSet(x) | shuffledPeptideSet <- Biostrings::AAStringSet(x) | ||||||
| names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep="")) | names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep="")) | ||||||
| shuffledPeptideSet | shuffledPeptideSet | ||||||
|  |  | ||||||
| length(shuffledPeptideSet) | length(shuffledPeptideSet) | ||||||
| Biostrings::width(shuffledPeptideSet) | Biostrings::width(shuffledPeptideSet) | ||||||
| Biostrings::alphabetFrequency(shuffledPeptideSet) | Biostrings::alphabetFrequency(shuffledPeptideSet) | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,165 +1,165 @@ | |||||||
| # tocID <- "RPR-ChimeraX_remote.R" | # tocID <- "RPR-ChimeraX_remote.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code demonstrating remote scripting of ChimeraX. | #              R code demonstrating remote scripting of ChimeraX. | ||||||
| # | # | ||||||
| # Version:  1.0.1 | # Version:  1.0.1 | ||||||
| # | # | ||||||
| # Date:     2020-09 | # Date:     2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.0.1  2021 Minimal updates | #           1.0.1  2021 Minimal updates | ||||||
| #           1.0    First ABC units version | #           1.0    First ABC units version | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| #    %-encode and escape quotes, or just pass-through? | #    %-encode and escape quotes, or just pass-through? | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                  Line | #TOC>   Section  Title                                  Line | ||||||
| #TOC> ------------------------------------------------------ | #TOC> ------------------------------------------------------ | ||||||
| #TOC>   1        ChimeraX REMOTE SCRIPTING                41 | #TOC>   1        ChimeraX REMOTE SCRIPTING                41 | ||||||
| #TOC>   1.1        Defining a Port                        59 | #TOC>   1.1        Defining a Port                        59 | ||||||
| #TOC>   1.2        Open ChimeraX                          81 | #TOC>   1.2        Open ChimeraX                          81 | ||||||
| #TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113 | #TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  ChimeraX REMOTE SCRIPTING  =========================================== | # =    1  ChimeraX REMOTE SCRIPTING  =========================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # One of the cool features of ChimeraX is that it can be driven by Python code, | # One of the cool features of ChimeraX is that it can be driven by Python code, | ||||||
| # both within a running session and through Python scripts. What I find even | # both within a running session and through Python scripts. What I find even | ||||||
| # cooler though is that ChimeraX can be driven from any programming language via | # cooler though is that ChimeraX can be driven from any programming language via | ||||||
| # its remote control function that can listen to commands sent from any other | # its remote control function that can listen to commands sent from any other | ||||||
| # application. The interface that is used here is the standard REST (method) - | # application. The interface that is used here is the standard REST (method) - | ||||||
| # the GET and POST verbs that ubiquitously underlie the communication of clients | # the GET and POST verbs that ubiquitously underlie the communication of clients | ||||||
| # and servers on the Web. | # and servers on the Web. | ||||||
|  |  | ||||||
| # In order to establish the communication between this script and ChimeraX, all | # In order to establish the communication between this script and ChimeraX, all | ||||||
| # we need to do is: | # we need to do is: | ||||||
| #  - open ChimeraX; | #  - open ChimeraX; | ||||||
| #  - tell it to listen on a specific "port"; | #  - tell it to listen on a specific "port"; | ||||||
| #  - send commands to that port via httr:: | #  - send commands to that port via httr:: | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.1  Defining a Port  =================================================== | # ==   1.1  Defining a Port  =================================================== | ||||||
|  |  | ||||||
| # The httr:: package needs to be available | # The httr:: package needs to be available | ||||||
|  |  | ||||||
| if (! requireNamespace("httr", quietly = TRUE)) { | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = httr)       # basic information | #  library(help = httr)       # basic information | ||||||
| #  browseVignettes("httr")    # available vignettes | #  browseVignettes("httr")    # available vignettes | ||||||
| #  data(package = "httr")     # available datasets | #  data(package = "httr")     # available datasets | ||||||
|  |  | ||||||
| # We need to think of a port. Any available port number between 49152-65535 is | # We need to think of a port. Any available port number between 49152-65535 is | ||||||
| # fine. We'll choose 61803 because that's the fractional part of the golden | # fine. We'll choose 61803 because that's the fractional part of the golden | ||||||
| # ratio. But one could choose another. | # ratio. But one could choose another. | ||||||
|  |  | ||||||
| CXPORT <- 61803 | CXPORT <- 61803 | ||||||
|  |  | ||||||
| # Check that our current version of R supports sockets (default since V 3.3) | # Check that our current version of R supports sockets (default since V 3.3) | ||||||
| capabilities("sockets")   # MUST be TRUE. If not, don't continue. | capabilities("sockets")   # MUST be TRUE. If not, don't continue. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.2  Open ChimeraX  ===================================================== | # ==   1.2  Open ChimeraX  ===================================================== | ||||||
|  |  | ||||||
| #  - Open a fresh, new session of a recently updated version of ChimeraX | #  - Open a fresh, new session of a recently updated version of ChimeraX | ||||||
| #  - type: | #  - type: | ||||||
| # | # | ||||||
| #       remotecontrol rest start port 61803 | #       remotecontrol rest start port 61803 | ||||||
| # | # | ||||||
| #    ... or whatever the value of CXPORT is. | #    ... or whatever the value of CXPORT is. | ||||||
|  |  | ||||||
| # Now watch what happens in ChimeraX when you execute the following line: | # Now watch what happens in ChimeraX when you execute the following line: | ||||||
| ( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") ) | ( x <- httr::GET("http://127.0.0.1:61803/run?command=open+1BM8") ) | ||||||
|  |  | ||||||
| # The .utilities.R script includes the function CX(), based on this principle, | # The .utilities.R script includes the function CX(), based on this principle, | ||||||
| # through which you can send commands to ChimeraX | # through which you can send commands to ChimeraX | ||||||
|  |  | ||||||
| CX("camera sbs") | CX("camera sbs") | ||||||
| CX("lighting soft") | CX("lighting soft") | ||||||
| CX("color sequential #1 & protein target abc palette powderblue:orchid:white") | CX("color sequential #1 & protein target abc palette powderblue:orchid:white") | ||||||
|  |  | ||||||
| # The command echos Chimera's response if the parameter "quietly" is | # The command echos Chimera's response if the parameter "quietly" is | ||||||
| # FALSE (default), and we can silence output with quietly = TRUE : | # FALSE (default), and we can silence output with quietly = TRUE : | ||||||
| CX("info models #1 attribute num_residues") | CX("info models #1 attribute num_residues") | ||||||
| CX("info models #1 attribute num_residues", quietly = TRUE) | CX("info models #1 attribute num_residues", quietly = TRUE) | ||||||
|  |  | ||||||
| # Either way, the command also returns Chimera's responses "invisibly"; | # Either way, the command also returns Chimera's responses "invisibly"; | ||||||
| # i.e. we can use the results by assigning the output to a variable: | # i.e. we can use the results by assigning the output to a variable: | ||||||
| hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE) | hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE) | ||||||
| x <- read.table(file = textConnection(hBonds), skip = 9, | x <- read.table(file = textConnection(hBonds), skip = 9, | ||||||
|                 blank.lines.skip = TRUE, fill = TRUE) |                 blank.lines.skip = TRUE, fill = TRUE) | ||||||
| hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff") | hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff") | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  WORKED EXAMPLE: SUPERPOSITION  ======================================= | # =    2  WORKED EXAMPLE: SUPERPOSITION  ======================================= | ||||||
|  |  | ||||||
| # We superimpose the 1BM8 structure with the 1DUX crystal structure to be able | # We superimpose the 1BM8 structure with the 1DUX crystal structure to be able | ||||||
| # to explore possible DNA binding regions in 1BM8 | # to explore possible DNA binding regions in 1BM8 | ||||||
|  |  | ||||||
| # The model for 1BM8 is already open as model 1  (#1) | # The model for 1BM8 is already open as model 1  (#1) | ||||||
| CX("hide #1 cartoons")        # hide model 1 cartoon representation | CX("hide #1 cartoons")        # hide model 1 cartoon representation | ||||||
| CX("open 1DUX")               # assume this is opened as model #2 | CX("open 1DUX")               # assume this is opened as model #2 | ||||||
| CX("hide #2")                 # hide everything ... | CX("hide #2")                 # hide everything ... | ||||||
| CX("select #2/C")             # chain c (protein) | CX("select #2/C")             # chain c (protein) | ||||||
| CX("show sel cartoons")       # ... and show cartoons of chain c (protein) | CX("show sel cartoons")       # ... and show cartoons of chain c (protein) | ||||||
| CX("color sequential sel target c palette steelblue:darkmagenta") | CX("color sequential sel target c palette steelblue:darkmagenta") | ||||||
| CX("view #2/C")               # re-center the display | CX("view #2/C")               # re-center the display | ||||||
| CX("cofr #2/C:62@CA")         # set pivot to an interface residue | CX("cofr #2/C:62@CA")         # set pivot to an interface residue | ||||||
| CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA | CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA | ||||||
| CX("style sel stick") | CX("style sel stick") | ||||||
| CX("show sel target ab")      # show atoms/bonds | CX("show sel target ab")      # show atoms/bonds | ||||||
| CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan") | CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan") | ||||||
| CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan") | CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan") | ||||||
| CX("surface sel enclose sel") # compute joint accessible surface of both chains | CX("surface sel enclose sel") # compute joint accessible surface of both chains | ||||||
| CX("transparency 50") | CX("transparency 50") | ||||||
| CX("select clear") | CX("select clear") | ||||||
|  |  | ||||||
| # Now superimpose the 1BM8 chain onto 1DUX chain C | # Now superimpose the 1BM8 chain onto 1DUX chain C | ||||||
| CX("show #1 cartoons") | CX("show #1 cartoons") | ||||||
| CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition | CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition | ||||||
|  |  | ||||||
| # study the general layout, and the position of the 1BM8 secondary structure | # study the general layout, and the position of the 1BM8 secondary structure | ||||||
| # elements relative to 1DUX | # elements relative to 1DUX | ||||||
|  |  | ||||||
| # Let's examine side chain orientations in more detail | # Let's examine side chain orientations in more detail | ||||||
| CX("hide #2/C cartoons")  # hide the 1DUX protein | CX("hide #2/C cartoons")  # hide the 1DUX protein | ||||||
|  |  | ||||||
| # select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b) | # select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b) | ||||||
| CX("select zone #2/A,B 3.5 #1 & protein residues true") | CX("select zone #2/A,B 3.5 #1 & protein residues true") | ||||||
| CX("~select sel & H")  # de-select H atoms | CX("~select sel & H")  # de-select H atoms | ||||||
| CX("show sel target ab") | CX("show sel target ab") | ||||||
| CX("size stickRadius 0.4") | CX("size stickRadius 0.4") | ||||||
| CX("select clear") | CX("select clear") | ||||||
|  |  | ||||||
| # The overall architecture of the Mbp1 APSES domain is a good match for the Elk | # The overall architecture of the Mbp1 APSES domain is a good match for the Elk | ||||||
| # transcription factor binding mode; the detailed conformations of side chains | # transcription factor binding mode; the detailed conformations of side chains | ||||||
| # would need to change only to a minor degree. There is a very significant | # would need to change only to a minor degree. There is a very significant | ||||||
| # degree of structural similarity; remarkable, given that the DNA is not the | # degree of structural similarity; remarkable, given that the DNA is not the | ||||||
| # target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was | # target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was | ||||||
| # determined without a DNA ligand. | # determined without a DNA ligand. | ||||||
|  |  | ||||||
| CX("remotecontrol rest stop")  # release the socket | CX("remotecontrol rest stop")  # release the socket | ||||||
| # Done. | # Done. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										644
									
								
								RPR-FASTA.R
									
									
									
									
									
								
							
							
						
						
									
										644
									
								
								RPR-FASTA.R
									
									
									
									
									
								
							| @@ -1,322 +1,322 @@ | |||||||
| # tocID <- "RPR-FASTA.R" | # tocID <- "RPR-FASTA.R" | ||||||
| # | # | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-FASTA unit. | #              R code accompanying the RPR-FASTA unit. | ||||||
| # | # | ||||||
| # Version:  1.1.2 | # Version:  1.1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2021-09 | # Date:     2017-10  -  2021-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.1.2  style update | #           1.1.2  style update | ||||||
| #           1.1.1  bugfix - wrong function name | #           1.1.1  bugfix - wrong function name | ||||||
| #           1.1    2020 Maintenance. Rewrite validation logic. Add data | #           1.1    2020 Maintenance. Rewrite validation logic. Add data | ||||||
| #                  to utilities. Define AACOLS | #                  to utilities. Define AACOLS | ||||||
| #           1.0    New unit. | #           1.0    New unit. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: Make a simple solution first, then extend it to error checking, and | # TODO: Make a simple solution first, then extend it to error checking, and | ||||||
| #       to handle .mfa files. | #       to handle .mfa files. | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                                 Line | #TOC>   Section  Title                                 Line | ||||||
| #TOC> ----------------------------------------------------- | #TOC> ----------------------------------------------------- | ||||||
| #TOC>   1        Reading and validating FASTA            45 | #TOC>   1        Reading and validating FASTA            45 | ||||||
| #TOC>   1.1        Validating FASTA                      81 | #TOC>   1.1        Validating FASTA                      81 | ||||||
| #TOC>   2        Parsing FASTA                          227 | #TOC>   2        Parsing FASTA                          227 | ||||||
| #TOC>   3        Interpreting FASTA                     247 | #TOC>   3        Interpreting FASTA                     247 | ||||||
| #TOC>   4        Writing FASTA                          274 | #TOC>   4        Writing FASTA                          274 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Reading and validating FASTA  ======================================== | # =    1  Reading and validating FASTA  ======================================== | ||||||
|  |  | ||||||
| # FASTA is a text based format, structured in lines that are separated by | # FASTA is a text based format, structured in lines that are separated by | ||||||
| # line-feed or paragraph-break characters. Which one of these is used, depends | # line-feed or paragraph-break characters. Which one of these is used, depends | ||||||
| # on your operating system. But R's readLines() function knows how to handle | # on your operating system. But R's readLines() function knows how to handle | ||||||
| # these correctly, across platforms. Don't try to read such files "by hand". | # these correctly, across platforms. Don't try to read such files "by hand". | ||||||
| # Here is the yeast Mbp1 gene, via SGD. | # Here is the yeast Mbp1 gene, via SGD. | ||||||
|  |  | ||||||
| file.show("./data/S288C_YDL056W_MBP1_coding.fsa") | file.show("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||||
| faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||||
|  |  | ||||||
| # The warning is generated because the programmer at the NCBI who implemented | # The warning is generated because the programmer at the NCBI who implemented | ||||||
| # the code to write this FASTA file neglected to place a line-break character | # the code to write this FASTA file neglected to place a line-break character | ||||||
| # after the last sequence character. While this is not technically incorrect, | # after the last sequence character. While this is not technically incorrect, | ||||||
| # it is poor practice: the resulting file can't be distinguished from one that | # it is poor practice: the resulting file can't be distinguished from one that | ||||||
| # has been truncated in transmission. | # has been truncated in transmission. | ||||||
|  |  | ||||||
| head(faMBP1) | head(faMBP1) | ||||||
|  |  | ||||||
| # Note that there are NO line-break characters ("\n") at the end of these | # Note that there are NO line-break characters ("\n") at the end of these | ||||||
| # strings, even though they were present in the original file. readLines() | # strings, even though they were present in the original file. readLines() | ||||||
| # has "consumed" these characters while reading - but every single line is an | # has "consumed" these characters while reading - but every single line is an | ||||||
| # element of its own in the resulting character vector. | # element of its own in the resulting character vector. | ||||||
|  |  | ||||||
| tail(faMBP1) | tail(faMBP1) | ||||||
|  |  | ||||||
| # Also note that the last line has fewer characters - this means readLines() | # Also note that the last line has fewer characters - this means readLines() | ||||||
| # imported the whole line, despite it not being terminated by "\n". | # imported the whole line, despite it not being terminated by "\n". | ||||||
|  |  | ||||||
| # It's very straightforward to work with such data, for example by collapsing | # It's very straightforward to work with such data, for example by collapsing | ||||||
| # everything except the first line into a single string ... | # everything except the first line into a single string ... | ||||||
|  |  | ||||||
| f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = "")) | f <- c(faMBP1[1], paste(faMBP1[-1], sep = "", collapse = "")) | ||||||
|  |  | ||||||
| f[1] | f[1] | ||||||
| nchar(f[2]) | nchar(f[2]) | ||||||
|  |  | ||||||
| # ==   1.1  Validating FASTA  ================================================== | # ==   1.1  Validating FASTA  ================================================== | ||||||
|  |  | ||||||
| # The code above is making the assumption that everything from line 2 until | # The code above is making the assumption that everything from line 2 until | ||||||
| #  the end IS sequence, the whole sequence and nothing but sequence. | #  the end IS sequence, the whole sequence and nothing but sequence. | ||||||
| #  That assumption can break down in many ways: | #  That assumption can break down in many ways: | ||||||
| # | # | ||||||
| #  - there could be more than one header line. The specification says otherwise, | #  - there could be more than one header line. The specification says otherwise, | ||||||
| #       but some older files use multiple, consecutive header lines. You don't | #       but some older files use multiple, consecutive header lines. You don't | ||||||
| #       want that to end up in your sequence. | #       want that to end up in your sequence. | ||||||
| #  - this could be not a FASTA file at all. It could be raw sequence, a | #  - this could be not a FASTA file at all. It could be raw sequence, a | ||||||
| #       different sequence file format, or a wholly different file altogether. | #       different sequence file format, or a wholly different file altogether. | ||||||
| #       If you look at the file, you can immediately tell, but if you are | #       If you look at the file, you can immediately tell, but if you are | ||||||
| #       reading the file in a complex workflow, you could easily import wrong | #       reading the file in a complex workflow, you could easily import wrong | ||||||
| #       data into your analysis. | #       data into your analysis. | ||||||
| #  - there could be more than one sequence in the file. Such Multi-FASTA files | #  - there could be more than one sequence in the file. Such Multi-FASTA files | ||||||
| #       occur commonly, as downloads of ORFs from genome regions or other | #       occur commonly, as downloads of ORFs from genome regions or other | ||||||
| #       sets of genes or proteins, or as the input / output for multiple | #       sets of genes or proteins, or as the input / output for multiple | ||||||
| #       sequence alignment programs. | #       sequence alignment programs. | ||||||
| # | # | ||||||
| # Data "from the wild" can (and usually does) have the most unexpected | # Data "from the wild" can (and usually does) have the most unexpected | ||||||
| # variations and it is really, really important to be clear about the | # variations and it is really, really important to be clear about the | ||||||
| # assumptions that you are making. It is possible to "fix" things, according | # assumptions that you are making. It is possible to "fix" things, according | ||||||
| # to the "Robustness Principle" : | # to the "Robustness Principle" : | ||||||
| #      "Be conservative in what you send, | #      "Be conservative in what you send, | ||||||
| #       be liberal in what you accept". | #       be liberal in what you accept". | ||||||
| #       (cf. https://en.wikipedia.org/wiki/Robustness_principle ) | #       (cf. https://en.wikipedia.org/wiki/Robustness_principle ) | ||||||
| # ... but if you think about this, that's actually a really poor idea, | # ... but if you think about this, that's actually a really poor idea, | ||||||
| # which is much more likely to dilute standards, make unwarranted | # which is much more likely to dilute standards, make unwarranted | ||||||
| # assumptions, and allow errors to pass silently and corrupt data. | # assumptions, and allow errors to pass silently and corrupt data. | ||||||
| # | # | ||||||
| # Let's discard this principle on the trash-heap of | # Let's discard this principle on the trash-heap of | ||||||
| # things-that-sound-like-a-good-idea-but-aren't. What we do instead is test, | # things-that-sound-like-a-good-idea-but-aren't. What we do instead is test, | ||||||
| # identify problems, and follow the principle: "crash early, crash often". Of | # identify problems, and follow the principle: "crash early, crash often". Of | ||||||
| # course I can write code that would reformat any possible input as a FASTA | # course I can write code that would reformat any possible input as a FASTA | ||||||
| # file - but what good will it do me if it parses the file I receive | # file - but what good will it do me if it parses the file I receive | ||||||
| # from a server into FASTA format like: | # from a server into FASTA format like: | ||||||
| # | # | ||||||
| #   >404- Page Not Found</title</head> | #   >404- Page Not Found</title</head> | ||||||
| #   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe | #   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe | ||||||
| #   spellingrcntacttheadministratrsdyhtml | #   spellingrcntacttheadministratrsdyhtml | ||||||
| # | # | ||||||
| # Therefore, we write ourselves a FASTA checker that will enforce the following: | # Therefore, we write ourselves a FASTA checker that will enforce the following: | ||||||
| #   (1) a FASTA file contains one or more sequences separated by zero or | #   (1) a FASTA file contains one or more sequences separated by zero or | ||||||
| #       more empty lines | #       more empty lines | ||||||
| #   (2) a sequence contains one header line followed by | #   (2) a sequence contains one header line followed by | ||||||
| #       one or more sequence lines | #       one or more sequence lines | ||||||
| #   (3) a sequence line contains one or more uppercase or lowercase single | #   (3) a sequence line contains one or more uppercase or lowercase single | ||||||
| #       letter amino acid codes, hyphens (gap character), or * (stop). | #       letter amino acid codes, hyphens (gap character), or * (stop). | ||||||
| # | # | ||||||
| #   Anything else should generate an error. | #   Anything else should generate an error. | ||||||
|  |  | ||||||
#   (Case 1): Header(s) exist
fX <- c("ABC", "defghi", "klmnpq")   # test input: three lines, no header

# The regular expression "^>" matches the literal character ">" only if it
# appears at the very beginning ("^") of a line - that is what marks a header.
sel <- grepl("^>", fX)
if (sum(sel) == 0) {                 # not a single header line was found
  stop("no header lines in input.")
}
|  |  | ||||||
|  |  | ||||||
#   (Case 2) No adjacent header lines
fX <- c(">ABC", ">123", "defghi", "klmnpq")   # test input: two headers in a row
sel <- grepl("^>", fX)

# Comparing shifted vectors: pair every line except the last with its
# successor (every line except the first). TRUE marks a header that is
# immediately followed by another header.
sel <- head(sel, -1) & tail(sel, -1)
if (any(sel)) {
  stop("adjacent header lines in input.")
}
|  |  | ||||||
#   (Case 3.1) all sequence lines contain only valid characters
#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
#               are defined with the .utilities.R script)
AAVALID                               # print the set of valid characters
fX <- c(">ABC",
        "def ;-) ghi",                # contains blanks and punctuation
        "klmnpq")
myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character
sel <- ! grepl("^>", fX)              # NOT headers
# Header lines may contain anything, so only the non-header lines are tested.
if (any(grepl(myRegex, fX[sel]))) {
  stop("invalid character(s) outside of header lines.")
}
|  |  | ||||||
#   (Case 3.2) all headers are followed directly by
#              at least one letter of sequence
fX <- c(">ABC", "",                   # this header is followed by an empty line
        ">123", "defghi", "klmnpq")
myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
sel <- grep("^>", fX) + 1             # indexes of headers + 1
# If a header is the last line, its successor index points past the end of
# fX and yields NA, which grepl() reports as FALSE.
if (any(! grepl(myRegex, fX[sel]))) {
  stop("a header has no adjacent sequence.")
}
| # Ah, you might ask - couldn't we just have dropped all empty lines, and | # Ah, you might ask - couldn't we just have dropped all empty lines, and | ||||||
| # then caught this in Case 2? No - for two reasons: we would still miss headers | # then caught this in Case 2? No - for two reasons: we would still miss headers | ||||||
| # at the end of file, and, we would have changed the line numbering - and | # at the end of file, and, we would have changed the line numbering - and | ||||||
| # ideally our "production" function will create information about where the | # ideally our "production" function will create information about where the | ||||||
| # error is to be found. | # error is to be found. | ||||||
|  |  | ||||||
|  |  | ||||||
| # Now combine this into a function ... | # Now combine this into a function ... | ||||||
|  |  | ||||||
val <- function(fa) {
  # Validate the lines of a FASTA file; stop() at the first violated rule.
  # Parameters:
  #    fa   chr  vector of lines, as read from a FASTA file
  # Value:  NULL (invisible). The function is called for its side effect:
  #         it stops with an error if the input is not valid.
  # Note:   the constant AAVALID (the set of valid sequence characters) is
  #         not defined in this function; it must be available when val()
  #         is called.

  isHeader <- grepl("^>", fa)     # TRUE for every line that begins with ">"

  # (1) there must be at least one header
  if ( ! any(isHeader) ) {
    stop("no header lines in input.")
  }

  # (2) no header may be directly followed by another header
  #     (compare the vector with itself, shifted by one position)
  if ( any(isHeader[- length(isHeader)] & isHeader[-1]) ) {
    stop("adjacent header lines in input.")
  }

  # (3.1) non-header lines must not contain any character outside AAVALID;
  #       empty lines pass, since they contain no character at all
  if ( any(grepl(sprintf("[^%s]", AAVALID), fa[! isHeader])) ) {
    stop("invalid character(s) outside of header lines.")
  }

  # (3.2) the line after each header must contain at least one valid
  #       character. For a header on the last line the index points past
  #       the end, yields NA, and grepl() reports FALSE for it.
  afterHeader <- which(isHeader) + 1
  if ( ! all(grepl(sprintf("[%s]+", AAVALID), fa[afterHeader])) ) {
    stop("a header has no adjacent sequence.")
  }

  return(invisible(NULL))
}
|  |  | ||||||
# Here is an example
FA <- c(">head1", "acdef", "ghi",   # first record: two sequence lines
        "",                         # an empty spacer line
        ">head2", "kl",             # second record
        ">head3", "mn", "pqrs")     # third record
val(FA)     # ... should not create an error


# A somewhat more elaborate validateFA() function was loaded with the
# .utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
# fasta files have space-characters in their spacer lines. Try it ...
validateFA(FA)
|  |  | ||||||
| # =    2  Parsing FASTA  ======================================================= | # =    2  Parsing FASTA  ======================================================= | ||||||
|  |  | ||||||
| # Once we have validated our assumptions about our input, it's quite | # Once we have validated our assumptions about our input, it's quite | ||||||
| # painless to parse it. I have put this together as a function and the function | # painless to parse it. I have put this together as a function and the function | ||||||
| # gets loaded from ./.utilities.R | # gets loaded from ./.utilities.R | ||||||
| # | # | ||||||
|  |  | ||||||
| # Let's try this: | # Let's try this: | ||||||
#   - the first 3 elements of faMBP1:
readFASTA(faMBP1[seq_len(3)])

#   - a multi FASTA file of aligned APSES domain sequences:

refAPSES <- readFASTA("./data/refAPSES.mfa")

# Subset the sequence with "P39678" in the header
# (a logical row-selector: TRUE where the header contains the ID)
refAPSES[grepl("P39678", refAPSES$head), ]
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Interpreting FASTA  ================================================== | # =    3  Interpreting FASTA  ================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # FASTA files are straightforward to interpret - just one thing may be of note: | # FASTA files are straightforward to interpret - just one thing may be of note: | ||||||
| # when working with strings, we can use substr(<string>, <start>, <stop>) to | # when working with strings, we can use substr(<string>, <start>, <stop>) to | ||||||
| # extract substrings, but more often we expand the string into a vector of | # extract substrings, but more often we expand the string into a vector of | ||||||
| # single characters with strsplit(<string>, ""). strsplit() returns a list, | # single characters with strsplit(<string>, ""). strsplit() returns a list, | ||||||
| # to accommodate that <string> could be a vector of many elements, therefore | # to accommodate that <string> could be a vector of many elements, therefore | ||||||
| # we usually unlist() the result if we use it only on a single string. | # we usually unlist() the result if we use it only on a single string. | ||||||
|  |  | ||||||
# Example: How many positive charged residues in "MBP1_SACCE"?

s <- refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)]  # the sequence string
s <- unlist(strsplit(s, ""))                          # one element per residue
s

sum(s %in% c("H", "K", "R")) # 20 (+) charged residues. %in% returns TRUE and
                             # FALSE for the characters, sum() coerces to 1
                             # and 0 respectively, and that gives the result.

100 * sum(s %in% c("H", "K", "R")) / length(s) # in percent: 20.2 %

# residue distribution
# (making s a factor with all levels of AACOLS keeps residues with zero counts)
x <- factor(s, levels = names(AACOLS))
pie(table(x)[names(AACOLS)], col = AACOLS)
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Writing FASTA  ======================================================= | # =    4  Writing FASTA  ======================================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # Writing FASTA files is mostly just the reverse of reading, with one | # Writing FASTA files is mostly just the reverse of reading, with one | ||||||
| # twist: we need to break the long sequence string into chunks of the desired | # twist: we need to break the long sequence string into chunks of the desired | ||||||
| # width. The FASTA specification calls for a maximum of 120 characters per line, | # width. The FASTA specification calls for a maximum of 120 characters per line, | ||||||
| # but writing out much less than that is common, since that lets us comfortably | # but writing out much less than that is common, since that lets us comfortably | ||||||
| # view lines on the console, or print them on a sheet of paper (do we still | # view lines on the console, or print them on a sheet of paper (do we still | ||||||
| # do that actually?). How do we break a string into chunks? A combination of | # do that actually?). How do we break a string into chunks? A combination of | ||||||
| # seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work | # seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work | ||||||
| # nicely. (Note that substring() is vectorized, whereas substr() is not!) As we | # nicely. (Note that substring() is vectorized, whereas substr() is not!) As we | ||||||
| # loop through our FASTA object in memory, we can build the output by c()'ing | # loop through our FASTA object in memory, we can build the output by c()'ing | ||||||
| # blocks of header + sequence to each other. For VERY large objects this might | # blocks of header + sequence to each other. For VERY large objects this might | ||||||
| # be slow - in that case, we might want to precalculate the size of the output | # be slow - in that case, we might want to precalculate the size of the output | ||||||
| # object. But that's more of a hypothetical consideration. | # object. But that's more of a hypothetical consideration. | ||||||
|  |  | ||||||
s <- refAPSES$seq[2]
s
nchar(s)
w <- 30                                  # width of chunk
starts <- seq(1, nchar(s), by = w)       # starting index of chunk
starts
ends <- c(starts[-1] - 1, nchar(s))      # ending index of chunk: one before
ends                                     # the next start; the last chunk
                                         # ends with the string.

# Task: Is this safe? What happens if nchar(s) is shorter than w?
#       What happens if nchar(s) is an exact multiple of w?

substring(s, starts, ends)
# confirm that the output contains the first and last residue, and both
# residues adjacent to the breaks

# As always, the function has been defined in ".utilities.R" for you to use
# any time...  type   writeFASTA  to examine it.

# Let's try this...

writeFASTA(refAPSES, width = 40)

# roundtrip for validation: write refAPSES with a different format,
# read it back in - the new dataframe must be identical
# to the original dataframe.
fname <- tempfile()
writeFASTA(refAPSES, fn = fname, width = 30)
identical(refAPSES, readFASTA(fname))
|  |  | ||||||
| # ...works for me  :-) | # ...works for me  :-) | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										1348
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							
							
						
						
									
										1348
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,385 +1,385 @@ | |||||||
| # tocID <- "RPR-Genetic_code_optimality.R" | # tocID <- "RPR-Genetic_code_optimality.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Genetic_code_optimality unit. | #              R code accompanying the RPR-Genetic_code_optimality unit. | ||||||
| # | # | ||||||
| # Version:  1.3 | # Version:  1.3 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.3    2020 Maintenance | #           1.3    2020 Maintenance | ||||||
| #           1.2    Change from require() to requireNamespace(), | #           1.2    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #                      use BiocManager:: not biocLite() | #                      use BiocManager:: not biocLite() | ||||||
| #           1.1      Update set.seed() usage | #           1.1      Update set.seed() usage | ||||||
| #           1.0.1    Fixed two bugs discovered by Suan Chin Yeo. | #           1.0.1    Fixed two bugs discovered by Suan Chin Yeo. | ||||||
| #           1.0      New material. | #           1.0      New material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                          Line | #TOC>   Section  Title                                          Line | ||||||
| #TOC> -------------------------------------------------------------- | #TOC> -------------------------------------------------------------- | ||||||
| #TOC>   1        Designing a computational experiment             58 | #TOC>   1        Designing a computational experiment             58 | ||||||
| #TOC>   2        Setting up the tools                             74 | #TOC>   2        Setting up the tools                             74 | ||||||
| #TOC>   2.1        Natural and alternative genetic codes          77 | #TOC>   2.1        Natural and alternative genetic codes          77 | ||||||
| #TOC>   2.2        Effect of mutations                           135 | #TOC>   2.2        Effect of mutations                           135 | ||||||
| #TOC>   2.2.1          reverse-translate                         146 | #TOC>   2.2.1          reverse-translate                         146 | ||||||
| #TOC>   2.2.2          Randomly mutate                           171 | #TOC>   2.2.2          Randomly mutate                           171 | ||||||
| #TOC>   2.2.3          Forward- translate                        196 | #TOC>   2.2.3          Forward- translate                        196 | ||||||
| #TOC>   2.2.4          measure effect                            213 | #TOC>   2.2.4          measure effect                            213 | ||||||
| #TOC>   3        Run the experiment                              267 | #TOC>   3        Run the experiment                              267 | ||||||
| #TOC>   4        Task solutions                                  363 | #TOC>   4        Task solutions                                  363 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # This unit demonstrates R code to simulate alternate genetic codes and evaluate | # This unit demonstrates R code to simulate alternate genetic codes and evaluate | ||||||
| # their robustness to code changes. The approaches are quite simple and you | # their robustness to code changes. The approaches are quite simple and you | ||||||
| # will be able to come up with obvious refinements; the point of this code is to | # will be able to come up with obvious refinements; the point of this code is to | ||||||
| # demonstrate some R programming techniques, in preparation for more | # demonstrate some R programming techniques, in preparation for more | ||||||
| # sophisticated questions later. | # sophisticated questions later. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Designing a computational experiment  ================================ | # =    1  Designing a computational experiment  ================================ | ||||||
|  |  | ||||||
| # Computational experiments are conducted like wet-lab experiments. We begin | # Computational experiments are conducted like wet-lab experiments. We begin | ||||||
| # with a hypothesis, then define the observables that relate to the hypothesis, | # with a hypothesis, then define the observables that relate to the hypothesis, | ||||||
| # then define the measures we apply to observations, and finally we interpret | # then define the measures we apply to observations, and finally we interpret | ||||||
| # our observations. If we want to learn something about the evolution of the | # our observations. If we want to learn something about the evolution of the | ||||||
| # genetic code ... | # genetic code ... | ||||||
|  |  | ||||||
| #  - we construct a hypothesis such as: the genetic code has evolved so as to | #  - we construct a hypothesis such as: the genetic code has evolved so as to | ||||||
| #      minimize the effect of mutations; | #      minimize the effect of mutations; | ||||||
| #  - we define the observables: the effect of mutations in | #  - we define the observables: the effect of mutations in | ||||||
| #      sequences, given the natural and possible alternative codes; | #      sequences, given the natural and possible alternative codes; | ||||||
| #  - we define the measures to quantify the effect of mutations; | #  - we define the measures to quantify the effect of mutations; | ||||||
| #  - then we compute alternatives and interpret the results. | #  - then we compute alternatives and interpret the results. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Setting up the tools  ================================================ | # =    2  Setting up the tools  ================================================ | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   2.1  Natural and alternative genetic codes  ============================= | # ==   2.1  Natural and alternative genetic codes  ============================= | ||||||
|  |  | ||||||
# Load genetic code tables from the Biostrings package.
# Biostrings is installed via BiocManager, so BiocManager is checked first;
# a package is installed only if its namespace can not be loaded.
if (! isTRUE(requireNamespace("BiocManager", quietly = TRUE))) {
  install.packages("BiocManager")
}
if (! isTRUE(requireNamespace("Biostrings", quietly = TRUE))) {
  BiocManager::install("Biostrings")
}
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # There are many ways to generate alternative codes. The simplest way is to | # There are many ways to generate alternative codes. The simplest way is to | ||||||
| # randomly assign amino acids to codons. A more sophisticated way is to keep the | # randomly assign amino acids to codons. A more sophisticated way is to keep the | ||||||
| # redundancy of codons intact, since it may reflect some form of symmetry | # redundancy of codons intact, since it may reflect some form of symmetry | ||||||
| # breaking that ignores the third nucleotide of a codon for the most part; | # breaking that ignores the third nucleotide of a codon for the most part; | ||||||
| # therefore we only replace the amino acids of the existing code with random | # therefore we only replace the amino acids of the existing code with random | ||||||
| # others. Here are two functions that implement these two ideas about alternate | # others. Here are two functions that implement these two ideas about alternate | ||||||
| # codes. | # codes. | ||||||
|  |  | ||||||
randomGC <- function(GC) {
  # Return a genetic code with randomly assigned amino acids.
  # Parameters:
  #    GC   named chr  character vector of amino acid one-letter codes plus
  #                       "*" (stop), named with the codon triplet. This is
  #                       normally of length 64, but a code table of any
  #                       length is handled.
  # Value:  named chr  vector of the same length and with the same names,
  #                       with random amino acid assignments in which every
  #                       amino acid and "*" of the input is encoded at
  #                       least once.

  aa <- unique(GC)                 # the amino acids in the input code
  nCodons <- length(GC)            # number of codons to assign

  repeat {
    # GC[] replaces all values but keeps the codon names.
    GC[] <- sample(aa, nCodons, replace = TRUE)    # random code
    # We could end up with a code that does not contain all amino acids,
    # then we sample() again.
    if (length(unique(GC)) == length(aa)) {
      break
    }
  }
  return(GC)
}
|  |  | ||||||
swappedGC <- function(GC) {
  # Return a genetic code with randomly swapped amino acids.
  # Parameters:
  #    GC   named chr  character vector of amino acid one-letter codes plus
  #                       "*" (stop), named with the codon triplet. This is
  #                       normally of length 64, but a code table of any
  #                       length is handled.
  # Value:  named chr  vector of the same length and with the same names,
  #                       in which the amino acids have been swapped: all
  #                       codons that shared an amino acid in the input
  #                       share one in the output too.

  aaOrig <- unique(GC)                       # the amino acids in the input code
  aaSwap <- sample(aaOrig, length(aaOrig))   # shuffled
  names(aaSwap) <- aaOrig                    # name them after the original

  # Look up the replacement for every codon by the name of its original
  # amino acid. GC[] replaces all values but keeps the codon names.
  GC[] <- unname(aaSwap[GC])

  return(GC)
}
|  |  | ||||||
|  |  | ||||||
| # ==   2.2  Effect of mutations  =============================================== | # ==   2.2  Effect of mutations  =============================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # To evaluate the effects of mutations we will do the following: | # To evaluate the effects of mutations we will do the following: | ||||||
| #   - we take an amino acid sequence (Mbp1 will do just nicely); | #   - we take an amino acid sequence (Mbp1 will do just nicely); | ||||||
| #   - we reverse-translate it into a nucleotide sequence; | #   - we reverse-translate it into a nucleotide sequence; | ||||||
| #   - we mutate it randomly; | #   - we mutate it randomly; | ||||||
| #   - we translate it back to amino acids; | #   - we translate it back to amino acids; | ||||||
| #   - we count the number of mutations and evaluate their severity. | #   - we count the number of mutations and evaluate their severity. | ||||||
|  |  | ||||||
|  |  | ||||||
| # ===   2.2.1  reverse-translate                     | # ===   2.2.1  reverse-translate                     | ||||||
|  |  | ||||||
| # To reverse-translate an amino acid vector, we randomly pick one of its | # To reverse-translate an amino acid vector, we randomly pick one of its | ||||||
| # codons from a genetic code, and assemble all codons to a sequence. | # codons from a genetic code, and assemble all codons to a sequence. | ||||||
|  |  | ||||||
traRev <- function(s, GC) {
  # Reverse-translate a sequence: replace every amino acid with one of its
  # codons, chosen at random.
  # Parameters:
  #      s   chr   a sequence vector (one amino acid per element)
  #      GC  chr   a genetic code: amino acids, named with their codons
  # Value:
  #      A reverse-translated vector of codons, of the same length as s.
  #      Stops with an error if an element of s has no codon in GC.
  vC <- character(length(s))

  for (i in seq_along(s)) {
    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
    if (length(codon) == 0) {        # crash early: nothing encodes this AA
      stop(sprintf("no codon for \"%s\" (position %d) in the genetic code.",
                   s[i], i))
    }
    if (length(codon) > 1) {         # if there's more than one ...
      codon <- sample(codon, 1)      # pick one at random ...
    }
    vC[i] <- codon                   # store it
  }

  return(vC)
}
|  |  | ||||||
|  |  | ||||||
| # ===   2.2.2  Randomly mutate                       | # ===   2.2.2  Randomly mutate                       | ||||||
|  |  | ||||||
| # To mutate, we split a codon into its three nucleotides, then randomly replace | # To mutate, we split a codon into its three nucleotides, then randomly replace | ||||||
| # one of the three with another nucleotide. | # one of the three with another nucleotide. | ||||||
|  |  | ||||||
randMut <- function(vC) {
  # Purpose:
  #    Introduce exactly one random point mutation into EVERY codon of vC.
  # Parameter:
  #    vC   chr     a vector of codons (three characters each)
  # Value:  chr     a vector of the same length in which each codon differs
  #                 from the corresponding codon of vC in exactly one position.
  #                 NA elements are returned unchanged.
  stopifnot(is.character(vC))
  if (any(nchar(vC[!is.na(vC)]) != 3)) {
    stop("randMut() expects codons of exactly three characters.",
         call. = FALSE)
  }

  nuc <- c("A", "C", "G", "T")

  for (i in seq_along(vC)) {
    if (is.na(vC[i])) {
      next                          # nothing to mutate; don't build "NA" text
    }
    triplet <- strsplit(vC[i], "")[[1]]            # split into three nucl.
    iNuc <- sample(1:3, 1)                         # choose one of the three
    mutNuc <- sample(nuc[nuc != triplet[iNuc]], 1) # choose a different nucl.
    triplet[iNuc] <- mutNuc                        # replace the original
    vC[i] <- paste0(triplet, collapse = "")        # collapse it to a codon
  }

  return(vC)
}
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ===   2.2.3  Forward- translate                    | # ===   2.2.3  Forward- translate                    | ||||||
|  |  | ||||||
traFor <- function(vC, GC) {
  # Purpose:
  #      Translate a vector of codons into amino acids with a genetic code.
  # Parameters:
  #      vC   chr   a codon vector
  #      GC   chr   a genetic code: a character vector of amino acids whose
  #                 names are the corresponding codons
  # Value:
  #      An unnamed vector of amino acids, same length as vC. Codons that are
  #      not names of GC (or are NA) translate to NA.

  # Subsetting by name is vectorized: look up all codons in a single step
  # rather than one at a time, then drop the codon names from the result.
  vAA <- unname(GC[as.character(vC)])

  return(vAA)
}
|  |  | ||||||
|  |  | ||||||
| # ===   2.2.4  measure effect                        | # ===   2.2.4  measure effect                        | ||||||
|  |  | ||||||
| # How do we evaluate the effect of the mutation? We'll take a simple ad hoc | # How do we evaluate the effect of the mutation? We'll take a simple ad hoc | ||||||
| # approach: we divide amino acids into hydrophobic, hydrophilic, and neutral | # approach: we divide amino acids into hydrophobic, hydrophilic, and neutral | ||||||
| # categories, according to their free energy of transfer from water to octanol: | # categories, according to their free energy of transfer from water to octanol: | ||||||
# One-letter codes of the amino acids in each of the three categories.
aaHphobic <- strsplit("MILCWYF", "")[[1]]   # hydrophobic
aaHphilic <- strsplit("EDQNPKR", "")[[1]]   # hydrophilic
aaNeutral <- strsplit("AHTSVG", "")[[1]]    # neutral
|  |  | ||||||
| # Then we will penalize as follows: | # Then we will penalize as follows: | ||||||
| # Changes within one category: 0.1 | # Changes within one category: 0.1 | ||||||
| # Changes from hydrophobic or hydrophilic to neutral or back: 0.3 | # Changes from hydrophobic or hydrophilic to neutral or back: 0.3 | ||||||
| # Changes from hydrophobic to hydrophilic or back: 1.0 | # Changes from hydrophobic to hydrophilic or back: 1.0 | ||||||
| # Changes to stop-codon: 3.0 | # Changes to stop-codon: 3.0 | ||||||
|  |  | ||||||
evalMut <- function(nat, mut, stopPenalty = 3.0) {
  # Purpose:
  #      Evaluate severity of mutations between amino acid sequence vectors nat
  #      and mut in an ad hoc approach based on hydrophobicity changes.
  # Parameters:
  #      nat          chr   the native amino acid sequence vector
  #      mut          chr   the mutated amino acid sequence vector; must have
  #                         the same length as nat
  #      stopPenalty  num   penalty for a change to a stop codon ("*").
  #                         Defaults to 3.0. Pass 0 to reproduce scores that
  #                         were computed without the stop-codon term.
  # Value:
  #      num  The sum of the penalties over all positions:
  #             change within one category                     0.1
  #             hydrophobic or hydrophilic <-> neutral         0.3
  #             hydrophobic <-> hydrophilic                    1.0
  #             change to stop codon                           stopPenalty
  #           Unchanged positions, positions with NA, and changes involving
  #           any other symbol contribute 0.
  stopifnot(length(nat) == length(mut))   # guard against silent recycling

  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
  aaNeutral <- c("A", "H", "T", "S", "V", "G")

  # Lookup table: amino acid -> category. Symbols that are not in the table
  # (e.g. "*") map to NA.
  aaClass <- c(rep("hphobic", length(aaHphobic)),
               rep("hphilic", length(aaHphilic)),
               rep("neutral", length(aaNeutral)))
  names(aaClass) <- c(aaHphobic, aaHphilic, aaNeutral)

  cNat <- unname(aaClass[as.character(nat)])
  cMut <- unname(aaClass[as.character(mut)])

  # TRUE for all mutated positions; NA positions are never counted
  lMut <- !is.na(nat) & !is.na(mut) & nat != mut
  # mutated positions where both residues belong to a known category
  lBoth <- lMut & !is.na(cNat) & !is.na(cMut)

  penalties <- numeric(length(nat))
  penalties[lBoth & cNat == cMut] <- 0.1
  penalties[lBoth & cNat != cMut &
              (cNat == "neutral" | cMut == "neutral")] <- 0.3
  penalties[lBoth & cNat != cMut &
              cNat != "neutral" & cMut != "neutral"] <- 1.0

  # Nonsense mutations: "*" is the symbol for a stop codon in the genetic code.
  penalties[lMut & mut == "*"] <- stopPenalty

  return(sum(penalties))
}
|  |  | ||||||
| # A more sophisticated approach could take additional quantities into account, | # A more sophisticated approach could take additional quantities into account, | ||||||
| # such as charge, size, or flexibility - and it could add heuristics, such as: | # such as charge, size, or flexibility - and it could add heuristics, such as: | ||||||
| # proline is always bad in secondary structure, charged amino acids are terrible | # proline is always bad in secondary structure, charged amino acids are terrible | ||||||
| # in the folded core of a protein, replacing a small by a large amino acid in | # in the folded core of a protein, replacing a small by a large amino acid in | ||||||
| # the core is very disruptive ... etc. | # the core is very disruptive ... etc. | ||||||
| # | # | ||||||
| # For our experiment, we should not  use a mutation data matrix however: | # For our experiment, we should not  use a mutation data matrix however: | ||||||
| # empirical mutation probabilities are superbly suited to estimate evolutionary | # empirical mutation probabilities are superbly suited to estimate evolutionary | ||||||
| # relationships. Here however, as we are trying to evaluate effects of random | # relationships. Here however, as we are trying to evaluate effects of random | ||||||
| # mutations on genetic codes, our reasoning would be circular - we would | # mutations on genetic codes, our reasoning would be circular - we would | ||||||
| # discover that the natural genetic code is optimal ... because it is most | # discover that the natural genetic code is optimal ... because it is most | ||||||
| # similar to the natural genetic code. That would be Cargo Cult bioinformatics. | # similar to the natural genetic code. That would be Cargo Cult bioinformatics. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Run the experiment  ================================================== | # =    3  Run the experiment  ================================================== | ||||||
|  |  | ||||||
| # Fetch the standard Genetic code from Biostrings:: | # Fetch the standard Genetic code from Biostrings:: | ||||||
|  |  | ||||||
stdCode <- Biostrings::GENETIC_CODE

# Read the nucleotide sequence for MBP1: discard the first line of the file,
# join the remaining lines into a single string ...
myDNA <- paste0(readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1],
                collapse = "")
# ... split the string into codons, and drop the last one (the stop codon).
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
myDNA <- myDNA[-length(myDNA)]

# The reference protein sequence, translated with the standard code.
myAA <- traFor(myDNA, stdCode)

# One trial: mutate every codon, translate, and score against the reference.
# Only randMut() draws random numbers, so only that step needs the seed.
set.seed(112358)
x <- traFor(randMut(myDNA), stdCode)
set.seed(NULL)
evalMut(myAA, x)  # 166.4
|  |  | ||||||
| # Try this 200 times, and see how the values are distributed. | # Try this 200 times, and see how the values are distributed. | ||||||
N <- 200
valSTDC <- numeric(N)              # one penalty score per trial

set.seed(112358)                   # set RNG seed for repeatable randomness
for (i in seq_len(N)) {            # this takes a few seconds ...
  x <- traFor(randMut(myDNA), stdCode)   # mutate, then translate
  valSTDC[i] <- evalMut(myAA, x)         # evaluate
}
set.seed(NULL)                     # reset the RNG

# Plot the scores. The axis limits leave room for the distributions of other
# genetic codes, which are added to this plot later.
hist(valSTDC,
     breaks = 15,
     col = "palegoldenrod",
     xlim = c(0, 400),
     ylim = c(0, N/4),
     main = "Standard vs. Synthetic Genetic Code",
     xlab = "Mutation penalty")

# The distribution looks approximately normal. We take its mean as the effect
# of mutations under the standard genetic code:
effectSTDC <- mean(valSTDC)  # 178.1
|  |  | ||||||
| # Now we can look at the effects of alternate genetic codes: | # Now we can look at the effects of alternate genetic codes: | ||||||
|  |  | ||||||
# Choose a new, random genetic code (seeded, so the code itself is repeatable).
set.seed(112358)
GC <- randomGC(stdCode)
set.seed(NULL)

# Reverse-translate the protein into a hypothetical nucleotide sequence with
# the new code, mutate it randomly, and translate it back with the same code.
# NOTE(review): the RNG is reset above, before traRev() and randMut() draw
# their random numbers, so the score below varies from run to run.
x <- traFor(randMut(traRev(myAA, GC)), GC)
evalMut(myAA, x)       # evaluate mutation effects: 298.5
|  |  | ||||||
| # That seems a fair bit higher than what we saw as "effectSTDC" | # That seems a fair bit higher than what we saw as "effectSTDC" | ||||||
| # Let's try with different genetic codes. 200 trials - but this time every trial | # Let's try with different genetic codes. 200 trials - but this time every trial | ||||||
| # is with a different, synthetic genetic code. | # is with a different, synthetic genetic code. | ||||||
|  |  | ||||||
N <- 200
valXGC <- numeric(N)             # one penalty score per synthetic code

set.seed(1414214)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {
  GC <- randomGC(stdCode)                      # choose a new code
  x <- traFor(randMut(traRev(myAA, GC)), GC)   # reverse translate, mutate,
                                               #   translate
  valXGC[i] <- evalMut(myAA, x)                # evaluate
}
set.seed(NULL)                   # reset the RNG

# Overlay the scores on the existing histogram.
hist(valXGC,
     breaks = 15,
     col = "plum",
     add = TRUE)
|  |  | ||||||
| # These two distributions are very widely separated! | # These two distributions are very widely separated! | ||||||
|  |  | ||||||
| # Task: Perform the same experiment with the swapped genetic code. | # Task: Perform the same experiment with the swapped genetic code. | ||||||
| #       Compare the distributions. Interpret the result. | #       Compare the distributions. Interpret the result. | ||||||
|  |  | ||||||
|  |  | ||||||
| # These are simple experiments, under assumptions that can be refined in | # These are simple experiments, under assumptions that can be refined in | ||||||
| # meaningful ways. Yet, even those simple computational experiments show | # meaningful ways. Yet, even those simple computational experiments show | ||||||
| # that the Universal Genetic Code has features that one would predict if | # that the Universal Genetic Code has features that one would predict if | ||||||
| # it has evolved under selective pressure to minimize the effects of mutations. | # it has evolved under selective pressure to minimize the effects of mutations. | ||||||
| # Gradual change under mutation is beneficial to evolution, disruptive | # Gradual change under mutation is beneficial to evolution, disruptive | ||||||
| # change is not. | # change is not. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Task solutions  ====================================================== | # =    4  Task solutions  ====================================================== | ||||||
|  |  | ||||||
N <- 200
valSGC <- numeric(N)             # one penalty score per swapped code

set.seed(2718282)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {
  GC <- swappedGC(stdCode)                     # choose a new code
  x <- traFor(randMut(traRev(myAA, GC)), GC)   # reverse translate, mutate,
                                               #   translate
  valSGC[i] <- evalMut(myAA, x)                # evaluate
}
set.seed(NULL)                   # reset the RNG

# Overlay the scores on the existing histogram; the colour is
# semi-transparent so the distributions underneath remain visible.
hist(valSGC,
     breaks = 15,
     col = "#6688FF88",
     add = TRUE)
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,50 +1,50 @@ | |||||||
| # tocID <- "RPR-Introduction.R" | # tocID <- "RPR-Introduction.R" | ||||||
| # | # | ||||||
| # | # | ||||||
| # Purpose: A Bioinformatics Course: | # Purpose: A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Introduction unit | #              R code accompanying the RPR-Introduction unit | ||||||
| # | # | ||||||
| # Version: 1.0 | # Version: 1.0 | ||||||
| # | # | ||||||
| # Date:    2020-09-18 | # Date:    2020-09-18 | ||||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # V 1.0    Updated workflow; live | # V 1.0    Updated workflow; live | ||||||
| # V 0.1    First code | # V 0.1    First code | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||||
| # | # | ||||||
| # DO NOT SIMPLY  source()  THESE FILES! | # DO NOT SIMPLY  source()  THESE FILES! | ||||||
|  |  | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| #  going on. That's not how it works ... | #  going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
| # === TASK: Local script | # === TASK: Local script | ||||||
| # | # | ||||||
| # - Open the file myScript.R | # - Open the file myScript.R | ||||||
| # | # | ||||||
| # - Create a section header with a date. | # - Create a section header with a date. | ||||||
| # - Enter an R-expression that will produce the first 11 powers of 2 (starting | # - Enter an R-expression that will produce the first 11 powers of 2 (starting | ||||||
| #     from 0). Not a loop - a single expression. The first number you get must | #     from 0). Not a loop - a single expression. The first number you get must | ||||||
| #     be 1. The last number you get must be 1024. | #     be 1. The last number you get must be 1024. | ||||||
| # | # | ||||||
| # - Save the file in the myScripts folder, and close it. | # - Save the file in the myScripts folder, and close it. | ||||||
| # | # | ||||||
| # - Open the file again, select the expression and type Cmd+Enter (or Cmd+R) | # - Open the file again, select the expression and type Cmd+Enter (or Cmd+R) | ||||||
| #   to execute it. | #   to execute it. | ||||||
| # | # | ||||||
| # - Done | # - Done | ||||||
|  |  | ||||||
| # (This task is meant  to make sure that writing R expressions, saving | # (This task is meant  to make sure that writing R expressions, saving | ||||||
| #  them in scripts, opening script files and executing code in the file works | #  them in scripts, opening script files and executing code in the file works | ||||||
| #  for you. If there is an issue, get in touch.) | #  for you. If there is an issue, get in touch.) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,168 +1,168 @@ | |||||||
| # tocID <- "RPR-PROSITE_POST.R" | # tocID <- "RPR-PROSITE_POST.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Maintenance | #           1.2    2020 Maintenance | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout, | #                      use <package>::<function>() idiom throughout, | ||||||
| #           1.0.1  Updates for slightly changed interfaces | #           1.0.1  Updates for slightly changed interfaces | ||||||
| #           1.0    First ABC units version | #           1.0    First ABC units version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                 Line | #TOC>   Section  Title                                                 Line | ||||||
| #TOC> --------------------------------------------------------------------- | #TOC> --------------------------------------------------------------------- | ||||||
| #TOC>   1        Constructing a POST command from a Web query            43 | #TOC>   1        Constructing a POST command from a Web query            43 | ||||||
| #TOC>   1.1        Task - fetchPrositeFeatures() function               148 | #TOC>   1.1        Task - fetchPrositeFeatures() function               148 | ||||||
| #TOC>   2        Task solutions                                         156 | #TOC>   2        Task solutions                                         156 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Constructing a POST command from a Web query  ======================== | # =    1  Constructing a POST command from a Web query  ======================== | ||||||
|  |  | ||||||
|  |  | ||||||
# Install the httr package, but only if it is not yet available.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
| # Package information: | # Package information: | ||||||
| #  library(help = httr)       # basic information | #  library(help = httr)       # basic information | ||||||
| #  browseVignettes("httr")    # available vignettes | #  browseVignettes("httr")    # available vignettes | ||||||
| #  data(package = "httr")     # available datasets | #  data(package = "httr")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # We have reverse engineered the Web form for a ScanProsite request, and can | # We have reverse engineered the Web form for a ScanProsite request, and can | ||||||
| # construct a valid POST request from knowing the required field names. The POST | # construct a valid POST request from knowing the required field names. The POST | ||||||
| # command is similar to GET(), but we need an explicit request body that | # command is similar to GET(), but we need an explicit request body that | ||||||
| # contains a list of key/value pairs | # contains a list of key/value pairs | ||||||
|  |  | ||||||
UniProtID <- "P39678"

URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"

# The request body is a list of key/value pairs, one per form field.
response <- httr::POST(url = URL,
                       body = list(meta = "opt1",
                                   meta1_protein = "opt1",
                                   seq = UniProtID,
                                   skip = "on",
                                   output = "tabular"))

# After sending the request, a response should arrive within a few seconds.
# Check the status first: if it is not 200, something went wrong and it makes
# no sense to continue. If this persists, ask on the Discussion Board what
# to do.
httr::status_code(response)


# content() returns the text contents of the response:
httr::content(response, as = "text")
|  |  | ||||||
| # ... should show you the same as the page contents that you have seen in the | # ... should show you the same as the page contents that you have seen in the | ||||||
| # browser. Now we need to extract the data from the page. For this simple | # browser. Now we need to extract the data from the page. For this simple | ||||||
| # example we can get away with using regular expressions, but in general we need | # example we can get away with using regular expressions, but in general we need | ||||||
| # a real XML parser to parse HTML. We'll cover that in a later unit. Here, we | # a real XML parser to parse HTML. We'll cover that in a later unit. Here, we | ||||||
| # strsplit() the response into individual lines, since each of our data elements | # strsplit() the response into individual lines, since each of our data elements | ||||||
| # is on its own line, and then capture the contents. The way Prosite has | # is on its own line, and then capture the contents. The way Prosite has | ||||||
| # formatted their HTML we can simply split on the "\\n" newline character - but | # formatted their HTML we can simply split on the "\\n" newline character - but | ||||||
| # they could write the same valid HTML without any newline-characters at all. | # they could write the same valid HTML without any newline-characters at all. | ||||||
| # Understand that we are working with a bit of a "hack" here: exploiting | # Understand that we are working with a bit of a "hack" here: exploiting | ||||||
| # empirical assumptions rather than a formal specification. But sometimes quick | # empirical assumptions rather than a formal specification. But sometimes quick | ||||||
| # and dirty is fine, because quick. | # and dirty is fine, because quick. | ||||||
|  |  | ||||||
# Break the response text into individual lines.
lines <- unlist(strsplit(httr::content(response, as = "text"), "\\n"))
head(lines)

# The lines we want contain the UniProt ID, bracketed by two "|" pipe
# characters. The pipes are escaped because we mean the literal character.
patt <- sprintf("\\|%s\\|", UniProtID)

# Keep only the lines that match this pattern, and show them:
lines <- grep(patt, lines, value = TRUE)
lines

# To break a line apart into tokens we apply strsplit() again, but this time
# we split either on "pipe" characters "|" OR on tabs "\t".
# Look at the regex "\\t|\\|" in the strsplit() call:

unlist(strsplit(lines[1], "\\t|\\|"))
|  |  | ||||||
| # Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped | # Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped | ||||||
| # with a backslash. "t" has to be escaped because we want to match a tab (\t), | # with a backslash. "t" has to be escaped because we want to match a tab (\t), | ||||||
| # not the literal character "t". And "|" has to be escaped because we mean the | # not the literal character "t". And "|" has to be escaped because we mean the | ||||||
| # literal pipe character, not its metacharacter meaning OR. Thus sometimes the | # literal pipe character, not its metacharacter meaning OR. Thus sometimes the | ||||||
| # backslash turns a special meaning off, and sometimes it turns a special | # backslash turns a special meaning off, and sometimes it turns a special | ||||||
| # meaning on. Unfortunately there's no easy way to tell - you just need to | # meaning on. Unfortunately there's no easy way to tell - you just need to | ||||||
| # remember the characters - or have a reference handy. The metacharacters are | # remember the characters - or have a reference handy. The metacharacters are | ||||||
| # (){}[]^$?*+.|&-   ... and some of them have different meanings depending on | # (){}[]^$?*+.|&-   ... and some of them have different meanings depending on | ||||||
| # where in the regex they are. | # where in the regex they are. | ||||||
|  |  | ||||||
| # Let's put the tokens into named slots of a data frame | # Let's put the tokens into named slots of a data frame | ||||||
|  |  | ||||||
| features <- data.frame() | features <- data.frame() | ||||||
| for (line in lines) { | for (line in lines) { | ||||||
|   tokens <- unlist(strsplit(line, "\\t|\\|")) |   tokens <- unlist(strsplit(line, "\\t|\\|")) | ||||||
|   features <- rbind(features, |   features <- rbind(features, | ||||||
|                     data.frame(uID   =  tokens[2], |                     data.frame(uID   =  tokens[2], | ||||||
|                                start =  as.numeric(tokens[4]), |                                start =  as.numeric(tokens[4]), | ||||||
|                                end   =  as.numeric(tokens[5]), |                                end   =  as.numeric(tokens[5]), | ||||||
|                                psID  =  tokens[6], |                                psID  =  tokens[6], | ||||||
|                                psName = tokens[7], |                                psName = tokens[7], | ||||||
|                                psSeq  = tokens[11])) |                                psSeq  = tokens[11])) | ||||||
| } | } | ||||||
| features | features | ||||||
|  |  | ||||||
| #  This forms the base of a function that collects the features automatically | #  This forms the base of a function that collects the features automatically | ||||||
| #  from a PrositeScan result. You can write this! | #  from a PrositeScan result. You can write this! | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.1  Task - fetchPrositeFeatures() function  ============================ | # ==   1.1  Task - fetchPrositeFeatures() function  ============================ | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: write a function that takes as input a UniProt ID, fetches the | # Task: write a function that takes as input a UniProt ID, fetches the | ||||||
| # features it contains from ScanProsite and returns a data frame as given above, or | # features it contains from ScanProsite and returns a data frame as given above, or | ||||||
| # an empty data frame if there is an error. | # an empty data frame if there is an error. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Task solutions  ====================================================== | # =    2  Task solutions  ====================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # I have placed such a function into the ABC-dbUtilities.R script: look it up by | # I have placed such a function into the ABC-dbUtilities.R script: look it up by | ||||||
| # clicking on  dbFetchPrositeFeatures() in the Environment pane. | # clicking on  dbFetchPrositeFeatures() in the Environment pane. | ||||||
|  |  | ||||||
| # Test: | # Test: | ||||||
| dbFetchPrositeFeatures("Q5KMQ9") | dbFetchPrositeFeatures("Q5KMQ9") | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										270
									
								
								RPR-Pipe.R
									
									
									
									
									
								
							
							
						
						
									
										270
									
								
								RPR-Pipe.R
									
									
									
									
									
								
							| @@ -1,135 +1,135 @@ | |||||||
| # tocID <- "RPR-Pipe.R" | # tocID <- "RPR-Pipe.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              Discussing pipe operators. | #              Discussing pipe operators. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.0 | ||||||
| # | # | ||||||
| # Date:     2021  10 | # Date:     2021  10 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.0    New code | #           1.0    New code | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| #   - find more interesting examples | #   - find more interesting examples | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                            Line | #TOC>   Section  Title                            Line | ||||||
| #TOC> ------------------------------------------------ | #TOC> ------------------------------------------------ | ||||||
| #TOC>   1        Pipe  Concept                      41 | #TOC>   1        Pipe  Concept                      41 | ||||||
| #TOC>   2        Nested Expression                  73 | #TOC>   2        Nested Expression                  73 | ||||||
| #TOC>   3        magrittr:: Pipe                    78 | #TOC>   3        magrittr:: Pipe                    78 | ||||||
| #TOC>   4        Base R Pipe                        93 | #TOC>   4        Base R Pipe                        93 | ||||||
| #TOC>   5        Intermediate Assignment           108 | #TOC>   5        Intermediate Assignment           108 | ||||||
| #TOC>   6        Postscript                        127 | #TOC>   6        Postscript                        127 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Pipe  Concept  ======================================================= | # =    1  Pipe  Concept  ======================================================= | ||||||
|  |  | ||||||
| # Pipes are actually an awesome idea for any code that implements a workflow - | # Pipes are actually an awesome idea for any code that implements a workflow - | ||||||
| # a sequence of operations, each of which transforms data in a specialized way. | # a sequence of operations, each of which transforms data in a specialized way. | ||||||
| # | # | ||||||
| # This principle is familiar from maths: chained functions. If I have a function | # This principle is familiar from maths: chained functions. If I have a function | ||||||
| # y = f(x) and want to use those results as in z = g(y), I can just write | # y = f(x) and want to use those results as in z = g(y), I can just write | ||||||
| # z = g(f(x)) | # z = g(f(x)) | ||||||
| # | # | ||||||
| # On the unix command line, pipes were used from the very beginning, implemented | # On the unix command line, pipes were used from the very beginning, implemented | ||||||
| # with the "|" pipe character. | # with the "|" pipe character. | ||||||
| # | # | ||||||
| # In R, the magrittr package provided the %>% operator, and recently the |> | # In R, the magrittr package provided the %>% operator, and recently the |> | ||||||
| # operator has been introduced into base R. | # operator has been introduced into base R. | ||||||
| # | # | ||||||
| # However there are alternatives: intermediate assignment, and nested functions | # However there are alternatives: intermediate assignment, and nested functions | ||||||
| # that have always existed in base R anyway. | # that have always existed in base R anyway. | ||||||
| # | # | ||||||
| # Let us look at an example. In writing this, I found out that virtually | # Let us look at an example. In writing this, I found out that virtually | ||||||
| # ALL non-trivial examples I came up with don't translate well into this idiom | # ALL non-trivial examples I came up with don't translate well into this idiom | ||||||
| # at all. It is actually quite limited to simple filtering operations on | # at all. It is actually quite limited to simple filtering operations on | ||||||
| # data. A more interesting example might be added in the future, let me know if | # data. A more interesting example might be added in the future, let me know if | ||||||
| # you have a good idea. | # you have a good idea. | ||||||
| # | # | ||||||
| # A somewhat contrived example is to sort a list of files by the | # A somewhat contrived example is to sort a list of files by the | ||||||
| # length of the file names: | # length of the file names: | ||||||
|  |  | ||||||
| myFiles <- list.files(pattern = "\\.R$") | myFiles <- list.files(pattern = "\\.R$") | ||||||
|  |  | ||||||
| # nchar() gives the number of characters in a string, order() produces indices | # nchar() gives the number of characters in a string, order() produces indices | ||||||
| # that map an array to its sorted form. | # that map an array to its sorted form. | ||||||
| # | # | ||||||
| # =    2  Nested Expression  =================================================== | # =    2  Nested Expression  =================================================== | ||||||
|  |  | ||||||
| myFiles[order(nchar(myFiles))] | myFiles[order(nchar(myFiles))] | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  magrittr:: Pipe  ===================================================== | # =    3  magrittr:: Pipe  ===================================================== | ||||||
|  |  | ||||||
| if (! requireNamespace("magrittr", quietly = TRUE)) { | if (! requireNamespace("magrittr", quietly = TRUE)) { | ||||||
|   install.packages("magrittr") |   install.packages("magrittr") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = magrittr)       # basic information | #  library(help = magrittr)       # basic information | ||||||
| #  browseVignettes("magrittr")    # available vignettes | #  browseVignettes("magrittr")    # available vignettes | ||||||
| #  data(package = "magrittr")     # available datasets | #  data(package = "magrittr")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| library(magrittr) | library(magrittr) | ||||||
|  |  | ||||||
| myFiles  %>% nchar %>% order %>% myFiles[.] | myFiles  %>% nchar %>% order %>% myFiles[.] | ||||||
|  |  | ||||||
| # =    4  Base R Pipe  ========================================================= | # =    4  Base R Pipe  ========================================================= | ||||||
|  |  | ||||||
| # Since version 4.1, base R now supports a pipe operator without the need | # Since version 4.1, base R now supports a pipe operator without the need | ||||||
| # to load a special package. Such an introduction of external functionality | # to load a special package. Such an introduction of external functionality | ||||||
| # into the language is very rare. | # into the language is very rare. | ||||||
| # | # | ||||||
| # Unfortunately it won't (yet) work with the '[' function, so we need to write | # Unfortunately it won't (yet) work with the '[' function, so we need to write | ||||||
| # an intermediate function for this example | # an intermediate function for this example | ||||||
| extract <- function(x, v) { | extract <- function(x, v) { | ||||||
|   return(v[x]) |   return(v[x]) | ||||||
| } | } | ||||||
|  |  | ||||||
| myFiles |> nchar() |> order() |> extract(myFiles) | myFiles |> nchar() |> order() |> extract(myFiles) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    5  Intermediate Assignment  ============================================= | # =    5  Intermediate Assignment  ============================================= | ||||||
|  |  | ||||||
| # So what's the problem? As you can see, the piped code may be concise and | # So what's the problem? As you can see, the piped code may be concise and | ||||||
| # expressive. But there is also a large amount of implicit assignment and | # expressive. But there is also a large amount of implicit assignment and | ||||||
| # processing going on and that is usually a bad idea because it makes code hard | # processing going on and that is usually a bad idea because it makes code hard | ||||||
| # to maintain. I am NOT a big fan of the nested syntax, but I don't think that | # to maintain. I am NOT a big fan of the nested syntax, but I don't think that | ||||||
| # replacing it with the pipe makes things much better. My preferred idiom is | # replacing it with the pipe makes things much better. My preferred idiom is | ||||||
| # to use intermediate assignments. Only then is it convenient to examine | # to use intermediate assignments. Only then is it convenient to examine | ||||||
| # the code step by step and validate every single step. And that is the most | # the code step by step and validate every single step. And that is the most | ||||||
| # important objective of all: no code is good if it does not compute | # important objective of all: no code is good if it does not compute | ||||||
| # correctly. | # correctly. | ||||||
|  |  | ||||||
|  |  | ||||||
| x <- nchar(myFiles) | x <- nchar(myFiles) | ||||||
| x <- order(x) | x <- order(x) | ||||||
| myFiles[x] | myFiles[x] | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    6  Postscript  ========================================================== | # =    6  Postscript  ========================================================== | ||||||
|  |  | ||||||
| # I tried to write an example that strips all comments from a list of files, and | # I tried to write an example that strips all comments from a list of files, and | ||||||
| # another example that finds all files that were not yet updated this year | # another example that finds all files that were not yet updated this year | ||||||
| # (according to the "# Date:" in the header). Neither example can be well | # (according to the "# Date:" in the header). Neither example can be well | ||||||
| # written without intermediate assignments, or at least sapply() functions | # written without intermediate assignments, or at least sapply() functions | ||||||
| # that are not simpler at all than the intermediate assignment. | # that are not simpler at all than the intermediate assignment. | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										360
									
								
								RPR-RegEx.R
									
									
									
									
									
								
							
							
						
						
									
										360
									
								
								RPR-RegEx.R
									
									
									
									
									
								
							| @@ -1,180 +1,180 @@ | |||||||
| # tocID <- "RPR-RegEx.R" | # tocID <- "RPR-RegEx.R" | ||||||
| # | # | ||||||
| # Purpose: A Bioinformatics Course: | # Purpose: A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-RegEx unit | #              R code accompanying the RPR-RegEx unit | ||||||
| # | # | ||||||
| # Version: 1.0 | # Version: 1.0 | ||||||
| # | # | ||||||
| # Date:    2017-08  -  2020-09 | # Date:    2017-08  -  2020-09 | ||||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # V 1.0    Maintenance 2020 | # V 1.0    Maintenance 2020 | ||||||
| # V 0.1    First code | # V 0.1    First code | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== | ||||||
| # | # | ||||||
| # DO NOT SIMPLY  source()  THESE FILES! | # DO NOT SIMPLY  source()  THESE FILES! | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| #  going on. That's not how it works ... | #  going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                                Line | #TOC>   Section  Title                                Line | ||||||
| #TOC> ---------------------------------------------------- | #TOC> ---------------------------------------------------- | ||||||
| #TOC>   1        A regex example                        41 | #TOC>   1        A regex example                        41 | ||||||
| #TOC>   2        Counting lines                        108 | #TOC>   2        Counting lines                        108 | ||||||
| #TOC>   2.1        Counting C-alpha atoms only         126 | #TOC>   2.1        Counting C-alpha atoms only         126 | ||||||
| #TOC>   3        Code Solutions                        142 | #TOC>   3        Code Solutions                        142 | ||||||
| #TOC>   3.1        Counting atoms                      144 | #TOC>   3.1        Counting atoms                      144 | ||||||
| #TOC>   3.2        Counting C-alpha records            160 | #TOC>   3.2        Counting C-alpha records            160 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  A regex example  ===================================================== | # =    1  A regex example  ===================================================== | ||||||
|  |  | ||||||
| # The canonical FASTA version of yeast Mbp1 at Uniprot | # The canonical FASTA version of yeast Mbp1 at Uniprot | ||||||
| s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1 | s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1 | ||||||
| MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK | MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK | ||||||
| ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA | ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA | ||||||
| SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR | SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR | ||||||
| KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ | KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ | ||||||
| QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS | QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS | ||||||
| PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY | PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY | ||||||
| FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS | FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS | ||||||
| IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP | IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP | ||||||
| SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT | SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT | ||||||
| ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP | ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP | ||||||
| VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK | VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK | ||||||
| IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR | IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR | ||||||
| QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK | QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK | ||||||
| IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA" | IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA" | ||||||
|  |  | ||||||
| nchar(s) | nchar(s) | ||||||
| # Must be 969 | # Must be 969 | ||||||
|  |  | ||||||
| # Task: Fetch the Uniprot ID by retrieving the first string that appears between | # Task: Fetch the Uniprot ID by retrieving the first string that appears between | ||||||
| # two vertical bars ("pipes") in the header record. | # two vertical bars ("pipes") in the header record. | ||||||
| # | # | ||||||
|  |  | ||||||
| # Develop the regular expression: | # Develop the regular expression: | ||||||
|                       # Just five characters returned, so we know we are using |                       # Just five characters returned, so we know we are using | ||||||
| patt <- "^>(.{5})"    # the right functions | patt <- "^>(.{5})"    # the right functions | ||||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||||
|  |  | ||||||
| patt <- "^>(.*)|"    # everything to the pipe character | patt <- "^>(.*)|"    # everything to the pipe character | ||||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||||
|  |  | ||||||
| # Ooops - "|" is a metacharacter - we must escape it | # Ooops - "|" is a metacharacter - we must escape it | ||||||
|  |  | ||||||
| patt <- "^>(.*)\|"    # using "\|" | patt <- "^>(.*)\|"    # using "\|" | ||||||
| # Ooops - that's not how we escape: must double the \ to send a literal | # Ooops - that's not how we escape: must double the \ to send a literal | ||||||
| # "\" plus the character "|" to the regex engine. | # "\" plus the character "|" to the regex engine. | ||||||
|  |  | ||||||
| patt <- "^>(.*)\\|"    # using "\\|" | patt <- "^>(.*)\\|"    # using "\\|" | ||||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||||
|  |  | ||||||
| # Good. Now let's first match everything that is not a "|", then match a "|" | # Good. Now let's first match everything that is not a "|", then match a "|" | ||||||
| patt <- "^>([^|]*)\\|" | patt <- "^>([^|]*)\\|" | ||||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||||
|  |  | ||||||
| # the same thing again, but capture the second match. And insist that there | # the same thing again, but capture the second match. And insist that there | ||||||
| # must be at least one character captured | # must be at least one character captured | ||||||
|  |  | ||||||
| patt <- "^>[^|]*\\|([^|]+)\\|" | patt <- "^>[^|]*\\|([^|]+)\\|" | ||||||
| # Analyze this pattern: | # Analyze this pattern: | ||||||
| #    ^           anchor the match at the beginning of the line | #    ^           anchor the match at the beginning of the line | ||||||
| #    >           ">" must be the first character | #    >           ">" must be the first character | ||||||
| #    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because | #    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because | ||||||
| #                  we don't know what other versions of the string "sp" | #                  we don't know what other versions of the string "sp" | ||||||
| #                  might appear. Note that within the brackets "|" is NOT a | #                  might appear. Note that within the brackets "|" is NOT a | ||||||
| #                  metacharacter. | #                  metacharacter. | ||||||
| #    \\|         "|" character: outside of square brackets "|" is a metacharacter | #    \\|         "|" character: outside of square brackets "|" is a metacharacter | ||||||
| #                  and means "OR"; we need to escape it to match a literal "|". | #                  and means "OR"; we need to escape it to match a literal "|". | ||||||
| #    (           open parenthesis: capture what comes next ... | #    (           open parenthesis: capture what comes next ... | ||||||
| #       [^|]+    all-characters-except-a-vertical-bar, 1 or more times | #       [^|]+    all-characters-except-a-vertical-bar, 1 or more times | ||||||
| #    )           close parenthesis: stop capturing here | #    )           close parenthesis: stop capturing here | ||||||
| #    \\|           second "|" character, escaped | #    \\|           second "|" character, escaped | ||||||
| regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2] | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Counting lines  ====================================================== | # =    2  Counting lines  ====================================================== | ||||||
|  |  | ||||||
| # Task: Write a function that returns the number of atoms in a PDB file. Call it | # Task: Write a function that returns the number of atoms in a PDB file. Call it | ||||||
| #       atomCount(). Sample data is here: | #       atomCount(). Sample data is here: | ||||||
| myPDB <- readLines("./data/0TST.pdb") | myPDB <- readLines("./data/0TST.pdb") | ||||||
|  |  | ||||||
| #       Specification: | #       Specification: | ||||||
| #       Read a file from its path given as the only argument. | #       Read a file from its path given as the only argument. | ||||||
| #       Return the number of lines in that file that begin with "ATOM  " | #       Return the number of lines in that file that begin with "ATOM  " | ||||||
| #       or with "HETATM". | #       or with "HETATM". | ||||||
|  |  | ||||||
| #       Try this. Write a function. Solution code is at the end of this file. | #       Try this. Write a function. Solution code is at the end of this file. | ||||||
| #       Don't peek. | #       Don't peek. | ||||||
|  |  | ||||||
| atomCount("./data/0TST.pdb")  # must return 6 | atomCount("./data/0TST.pdb")  # must return 6 | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   2.1  Counting C-alpha atoms only  ======================================= | # ==   2.1  Counting C-alpha atoms only  ======================================= | ||||||
|  |  | ||||||
| # Task: write a function based on the previous one that matches only CA records, | # Task: write a function based on the previous one that matches only CA records, | ||||||
| #       i.e. it can be used to count the number of amino acids. Don't get | #       i.e. it can be used to count the number of amino acids. Don't get | ||||||
| #       fooled by calcium atoms, or the string CA appearing elsewhere. | #       fooled by calcium atoms, or the string CA appearing elsewhere. | ||||||
| #       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM | #       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM | ||||||
|  |  | ||||||
| #       Specification: | #       Specification: | ||||||
| #       Read a file from its path given as the only argument. | #       Read a file from its path given as the only argument. | ||||||
| #       Return the number of lines in that file that have a C-alpha atom. | #       Return the number of lines in that file that have a C-alpha atom. | ||||||
|  |  | ||||||
| #       Try this. Solution code is at the end of this file. Don't peek. | #       Try this. Solution code is at the end of this file. Don't peek. | ||||||
|  |  | ||||||
| CAcount("./data/0TST.pdb")  # must return 1 | CAcount("./data/0TST.pdb")  # must return 1 | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Code Solutions  ====================================================== | # =    3  Code Solutions  ====================================================== | ||||||
|  |  | ||||||
| # ==   3.1  Counting atoms  ==================================================== | # ==   3.1  Counting atoms  ==================================================== | ||||||
|  |  | ||||||
| atomCount <- function(IN) { | atomCount <- function(IN) { | ||||||
|   # count the number of atoms in a PDB formatted file |   # count the number of atoms in a PDB formatted file | ||||||
|   # Parameters: |   # Parameters: | ||||||
|   #     IN  chr  path of the file to read |   #     IN  chr  path of the file to read | ||||||
|   # Value: |   # Value: | ||||||
|   #         numeric  number of lines that match "^ATOM  " or "^HETATM" |   #         numeric  number of lines that match "^ATOM  " or "^HETATM" | ||||||
|   # Note: the regex MUST be anchored to the beginning of the line, otherwise |   # Note: the regex MUST be anchored to the beginning of the line, otherwise | ||||||
|   # it might match somewhere in a comment! |   # it might match somewhere in a comment! | ||||||
|   x <- readLines(IN) |   x <- readLines(IN) | ||||||
|   patt <- "(^ATOM  )|(^HETATM)" |   patt <- "(^ATOM  )|(^HETATM)" | ||||||
|   return(length(grep(patt, x))) |   return(length(grep(patt, x))) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.2  Counting C-alpha records  ========================================== | # ==   3.2  Counting C-alpha records  ========================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| CAcount <- function(IN) { | CAcount <- function(IN) { | ||||||
|   # count the number of C-alpha atoms in a PDB formatted file |   # count the number of C-alpha atoms in a PDB formatted file | ||||||
|   # Parameters: |   # Parameters: | ||||||
|   #     IN  chr  path of the file to read |   #     IN  chr  path of the file to read | ||||||
|   # Value: |   # Value: | ||||||
|   #         numeric  number of lines that match " CA " in position 13 - 16 of |   #         numeric  number of lines that match " CA " in position 13 - 16 of | ||||||
|   #                  an ATOM record. |   #                  an ATOM record. | ||||||
|   # Note: the regex MUST be aligned into the right position, otherwise it |   # Note: the regex MUST be aligned into the right position, otherwise it | ||||||
|   #       might match Calcium records! |   #       might match Calcium records! | ||||||
|   x <- readLines(IN) |   x <- readLines(IN) | ||||||
|   patt <- "^ATOM  ...... CA " |   patt <- "^ATOM  ...... CA " | ||||||
|   return(length(grep(patt, x))) |   return(length(grep(patt, x))) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										1658
									
								
								RPR-SX-PDB.R
									
									
									
									
									
								
							
							
						
						
									
										1658
									
								
								RPR-SX-PDB.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,135 +1,135 @@ | |||||||
| # tocID <- "RPR-UniProt_GET.R" | # tocID <- "RPR-UniProt_GET.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and | #           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and | ||||||
| #                  added FASTA headers as attribute | #                  added FASTA headers as attribute | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    First ABC units version | #           1.0    First ABC units version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                      Line | #TOC>   Section  Title                                      Line | ||||||
| #TOC> ---------------------------------------------------------- | #TOC> ---------------------------------------------------------- | ||||||
| #TOC>   1        UniProt files via GET                        43 | #TOC>   1        UniProt files via GET                        43 | ||||||
| #TOC>   1.1        Task - fetchUniProtSeq() function         105 | #TOC>   1.1        Task - fetchUniProtSeq() function         105 | ||||||
| #TOC>   2        Task solutions                              118 | #TOC>   2        Task solutions                              118 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  UniProt files via GET  =============================================== | # =    1  UniProt files via GET  =============================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Perhaps the simplest example of scripted download is to retrieve a protein | # Perhaps the simplest example of scripted download is to retrieve a protein | ||||||
| # FASTA sequence from UniProt. All we need is to construct a URL with the | # FASTA sequence from UniProt. All we need is to construct a URL with the | ||||||
| # correct UniProt ID. | # correct UniProt ID. | ||||||
|  |  | ||||||
| # An interface between R scripts and Web servers is provided by the httr:: | # An interface between R scripts and Web servers is provided by the httr:: | ||||||
| # package. This sends and receives information via the http protocol, just like | # package. This sends and receives information via the http protocol, just like | ||||||
| # a Web browser. Since this is a short and simple request, the GET verb is the | # a Web browser. Since this is a short and simple request, the GET verb is the | ||||||
| # right tool: | # right tool: | ||||||
|  |  | ||||||
| if (! requireNamespace("httr", quietly = TRUE)) { | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = httr)       # basic information | #  library(help = httr)       # basic information | ||||||
| #  browseVignettes("httr")    # available vignettes | #  browseVignettes("httr")    # available vignettes | ||||||
| #  data(package = "httr")     # available datasets | #  data(package = "httr")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # The UniProt ID for Mbp1 is ... | # The UniProt ID for Mbp1 is ... | ||||||
|  |  | ||||||
| UniProtID <- "P39678" | UniProtID <- "P39678" | ||||||
|  |  | ||||||
| # and the base URL to retrieve data is  ... | # and the base URL to retrieve data is  ... | ||||||
| # http://www.uniprot.org/uniprot/ . We can construct a simple URL to | # http://www.uniprot.org/uniprot/ . We can construct a simple URL to | ||||||
| # retrieve a FASTA sequence: | # retrieve a FASTA sequence: | ||||||
|  |  | ||||||
| (URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)) | (URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)) | ||||||
|  |  | ||||||
| # the GET() function from httr will get the data. | # the GET() function from httr will get the data. | ||||||
| response <- httr::GET(URL) | response <- httr::GET(URL) | ||||||
|  |  | ||||||
| str(response) # the response object is a bit complex ... | str(response) # the response object is a bit complex ... | ||||||
| as.character(response) # ... but it is easy to pull out the data. | as.character(response) # ... but it is easy to pull out the data. | ||||||
|  |  | ||||||
| # to process  ... | # to process  ... | ||||||
| x <- as.character(response) | x <- as.character(response) | ||||||
| x <- strsplit(x, "\n") | x <- strsplit(x, "\n") | ||||||
| dbSanitizeSequence(x) | dbSanitizeSequence(x) | ||||||
|  |  | ||||||
| # Simple. | # Simple. | ||||||
| # But what happens if there is an error, e.g. the uniprot ID does not exist? | # But what happens if there is an error, e.g. the uniprot ID does not exist? | ||||||
|  |  | ||||||
| response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta") | response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta") | ||||||
| as.character(response) | as.character(response) | ||||||
| # this is a large HTML page that tells us the URL was not found. So we need to | # this is a large HTML page that tells us the URL was not found. So we need to | ||||||
| # check for errors.  The Right Way to do this is to evaluate the status code that | # check for errors.  The Right Way to do this is to evaluate the status code that | ||||||
| # every Web server returns for every transaction. | # every Web server returns for every transaction. | ||||||
| # | # | ||||||
| httr::status_code(response)  # 404 == Page Not Found | httr::status_code(response)  # 404 == Page Not Found | ||||||
|  |  | ||||||
| # There are many possible codes, but the only code we will be happy with | # There are many possible codes, but the only code we will be happy with | ||||||
| # is 200 - oK. | # is 200 - oK. | ||||||
| # (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes ) | # (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes ) | ||||||
|  |  | ||||||
| URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID) | URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID) | ||||||
| response <- httr::GET(URL) | response <- httr::GET(URL) | ||||||
| httr::status_code(response) | httr::status_code(response) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.1  Task - fetchUniProtSeq() function  ================================= | # ==   1.1  Task - fetchUniProtSeq() function  ================================= | ||||||
|  |  | ||||||
| # Task: write a function that | # Task: write a function that | ||||||
| #   - takes as input a vector of UniProt IDs, | #   - takes as input a vector of UniProt IDs, | ||||||
| #   - fetches the FASTA sequence for each | #   - fetches the FASTA sequence for each | ||||||
| #   - returns a vector of the same length as the input, where an element is: | #   - returns a vector of the same length as the input, where an element is: | ||||||
| #   -  ...  the sequence, if the query was successful | #   -  ...  the sequence, if the query was successful | ||||||
| #   -  ...  NA if there was an error | #   -  ...  NA if there was an error | ||||||
| #   - each element has the UniProt ID as the name() | #   - each element has the UniProt ID as the name() | ||||||
| #   - bonus: the output has an attribute "headers" that is a vector of the | #   - bonus: the output has an attribute "headers" that is a vector of the | ||||||
| #            FASTA headers ( cf. ?attr ) | #            FASTA headers ( cf. ?attr ) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Task solutions  ====================================================== | # =    2  Task solutions  ====================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # I have placed such a function - dbFetchUniProtSeq() - into | # I have placed such a function - dbFetchUniProtSeq() - into | ||||||
| # "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq() | # "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq() | ||||||
| # in the Environment pane. | # in the Environment pane. | ||||||
|  |  | ||||||
| # Test this: | # Test this: | ||||||
| ( x <- dbFetchUniProtSeq("P39678") ) | ( x <- dbFetchUniProtSeq("P39678") ) | ||||||
| names(x)[1] | names(x)[1] | ||||||
| attr(x, "headers")[1] | attr(x, "headers")[1] | ||||||
| x[1] | x[1] | ||||||
| cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]), | cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]), | ||||||
|                width = 40), sep = "\n") |                width = 40), sep = "\n") | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,234 +1,234 @@ | |||||||
| # tocID <- "RPR-Unit_testing.R" | # tocID <- "RPR-Unit_testing.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Unit_testing unit. | #              R code accompanying the RPR-Unit_testing unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017  10  -  2019  01 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Updates. Discuss local tests. | #           1.2    2020 Updates. Discuss local tests. | ||||||
| #           1.1    Change from require() to requireNamespace() | #           1.1    Change from require() to requireNamespace() | ||||||
| #           1.0    New code | #           1.0    New code | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                             Line | #TOC>   Section  Title                             Line | ||||||
| #TOC> ------------------------------------------------- | #TOC> ------------------------------------------------- | ||||||
| #TOC>   1        Unit Tests with testthat            42 | #TOC>   1        Unit Tests with testthat            42 | ||||||
| #TOC>   2        Organizing your tests              165 | #TOC>   2        Organizing your tests              165 | ||||||
| #TOC>   2.1        Testing scripts                  189 | #TOC>   2.1        Testing scripts                  189 | ||||||
| #TOC>   2.2        Rethinking testing               202 | #TOC>   2.2        Rethinking testing               202 | ||||||
| #TOC>   3        Task solutions                     220 | #TOC>   3        Task solutions                     220 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Unit Tests with testthat  ============================================ | # =    1  Unit Tests with testthat  ============================================ | ||||||
|  |  | ||||||
| # The testthat package supports writing and executing unit tests in many ways. | # The testthat package supports writing and executing unit tests in many ways. | ||||||
|  |  | ||||||
| if (! requireNamespace("testthat", quietly = TRUE)) { | if (! requireNamespace("testthat", quietly = TRUE)) { | ||||||
|   install.packages("testthat") |   install.packages("testthat") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = testthat)       # basic information | #  library(help = testthat)       # basic information | ||||||
| #  browseVignettes("testthat")    # available vignettes | #  browseVignettes("testthat")    # available vignettes | ||||||
| #  data(package = "testthat")     # available datasets | #  data(package = "testthat")     # available datasets | ||||||
|  |  | ||||||
| # testthat is one of those packages that we either use A LOT in a script, | # testthat is one of those packages that we either use A LOT in a script, | ||||||
| # or not at all. Therefore it's more reasonable to depart from our usual | # or not at all. Therefore it's more reasonable to depart from our usual | ||||||
| # <package>::<function>() idiom, and load the entire library. In fact, if | # <package>::<function>() idiom, and load the entire library. In fact, if | ||||||
| # we author packages, it is common practice to load testthat in the part | # we author packages, it is common practice to load testthat in the part | ||||||
| # of the package that automates testing. | # of the package that automates testing. | ||||||
|  |  | ||||||
| library(testthat) | library(testthat) | ||||||
|  |  | ||||||
| # An atomic test consists of an expectation about the behaviour of a function or | # An atomic test consists of an expectation about the behaviour of a function or | ||||||
| # the existence of an object. testthat provides a number of useful expectations: | # the existence of an object. testthat provides a number of useful expectations: | ||||||
|  |  | ||||||
| # At the most basic level, you can use expect_true() and expect_false(): | # At the most basic level, you can use expect_true() and expect_false(): | ||||||
|  |  | ||||||
| expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa")) | expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa")) | ||||||
| expect_true(file.exists("NO-SUCH-FILE.txt")) | expect_true(file.exists("NO-SUCH-FILE.txt")) | ||||||
|  |  | ||||||
| expect_false(is.integer(NA)) | expect_false(is.integer(NA)) | ||||||
|  |  | ||||||
| # More commonly, you will test for equality of an output with a given result. | # More commonly, you will test for equality of an output with a given result. | ||||||
| # But you need to consider what it means for two numbers to be "equal" on a | # But you need to consider what it means for two numbers to be "equal" on a | ||||||
| # digital computer. Consider: | # digital computer. Consider: | ||||||
|  |  | ||||||
| 49*(1/49) == 1      # Surprised? Read FAQ 7.31 | 49*(1/49) == 1      # Surprised? Read FAQ 7.31 | ||||||
|                     # https://cran.r-project.org/doc/FAQ/R-FAQ.html |                     # https://cran.r-project.org/doc/FAQ/R-FAQ.html | ||||||
| 49*(1/49) - 1       # NOT zero (but almost) | 49*(1/49) - 1       # NOT zero (but almost) | ||||||
|  |  | ||||||
| # This is really unpredictable ... | # This is really unpredictable ... | ||||||
| 0.1 + 0.05 == 0.15 | 0.1 + 0.05 == 0.15 | ||||||
| 0.2 + 0.07 == 0.27 | 0.2 + 0.07 == 0.27 | ||||||
|  |  | ||||||
| # It's easy to be caught on the wrong foot with numeric comparisons, therefore | # It's easy to be caught on the wrong foot with numeric comparisons, therefore | ||||||
| # R uses the function all.equal() to test whether two numbers are equal for | # R uses the function all.equal() to test whether two numbers are equal for | ||||||
| # practical purposes up to machine precision. | # practical purposes up to machine precision. | ||||||
| 49*(1/49) == 1 | 49*(1/49) == 1 | ||||||
| all.equal(49*(1/49), 1) | all.equal(49*(1/49), 1) | ||||||
|  |  | ||||||
| # The testthat function expect_equal() uses all.equal internally: | # The testthat function expect_equal() uses all.equal internally: | ||||||
| expect_equal(49*(1/49), 1) | expect_equal(49*(1/49), 1) | ||||||
|  |  | ||||||
| # ... which is reasonable, or, if things MUST be exactly the same ... | # ... which is reasonable, or, if things MUST be exactly the same ... | ||||||
| expect_identical(49*(1/49), 1) | expect_identical(49*(1/49), 1) | ||||||
|  |  | ||||||
| # ... but consider: | # ... but consider: | ||||||
| expect_identical(2, 2L) # one is typeof() "double", the other is "integer" | expect_identical(2, 2L) # one is typeof() "double", the other is "integer" | ||||||
|  |  | ||||||
| # Some very useful expectations are expect_warning(), and expect_error(), for | # Some very useful expectations are expect_warning(), and expect_error(), for | ||||||
| # constructing tests that check for erroneous output: | # constructing tests that check for erroneous output: | ||||||
|  |  | ||||||
| as.integer(c("1", "2", "three")) | as.integer(c("1", "2", "three")) | ||||||
| expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT | expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT | ||||||
|                                                  # printed. |                                                  # printed. | ||||||
| 1/"x" | 1/"x" | ||||||
| expect_warning(1/"x") | expect_warning(1/"x") | ||||||
| expect_error(1/"x")      # Again: note that the error is NOT printed, and | expect_error(1/"x")      # Again: note that the error is NOT printed, and | ||||||
|                          # code execution will continue. |                          # code execution will continue. | ||||||
|  |  | ||||||
| # Even better, you can check if the warning or error is what you expect it | # Even better, you can check if the warning or error is what you expect it | ||||||
| # to be - because it could actually have occurred somewhere else in your code. | # to be - because it could actually have occurred somewhere else in your code. | ||||||
|  |  | ||||||
| v <- c("1", "x") | v <- c("1", "x") | ||||||
| log(v[1:2]) | log(v[1:2]) | ||||||
| expect_error(log(v[1:2]), "non-numeric argument to mathematical function") | expect_error(log(v[1:2]), "non-numeric argument to mathematical function") | ||||||
| expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message. | expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message. | ||||||
| expect_error(log(v[1,2]))                # This appears oK, but ... | expect_error(log(v[1,2]))                # This appears oK, but ... | ||||||
| expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error! | expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error! | ||||||
|  |  | ||||||
| # Producing unit tests simply means: we define a function, and then we check | # Producing unit tests simply means: we define a function, and then we check | ||||||
| # whether all tests pass. Consider a function that is loaded on startup from | # whether all tests pass. Consider a function that is loaded on startup from | ||||||
| # the .utilities.R script: | # the .utilities.R script: | ||||||
|  |  | ||||||
| biCode | biCode | ||||||
|  |  | ||||||
| # We could test it like so: | # We could test it like so: | ||||||
|  |  | ||||||
| expect_equal(biCode(""), ".....") | expect_equal(biCode(""), ".....") | ||||||
| expect_equal(biCode(" "), ".....") | expect_equal(biCode(" "), ".....") | ||||||
| expect_equal(biCode("123 12"), ".....") | expect_equal(biCode("123 12"), ".....") | ||||||
| expect_equal(biCode("h sapiens"), "H..SA") | expect_equal(biCode("h sapiens"), "H..SA") | ||||||
| expect_equal(biCode("homo sapiens"), "HOMSA") | expect_equal(biCode("homo sapiens"), "HOMSA") | ||||||
| expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA") | expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA") | ||||||
| expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")), | expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")), | ||||||
|              c("PHACI", "MACRU")) |              c("PHACI", "MACRU")) | ||||||
| expect_error(biCode(), "argument \"s\" is missing, with no default") | expect_error(biCode(), "argument \"s\" is missing, with no default") | ||||||
|  |  | ||||||
| # The test_that() function allows to group related tests, include an informative | # The test_that() function allows to group related tests, include an informative | ||||||
| # message which test is being executed, and run a number of tests that are | # message which test is being executed, and run a number of tests that are | ||||||
| # passed to the function inside a code block - i.e. {...} | # passed to the function inside a code block - i.e. {...} | ||||||
| # test_that("<descriptive string>", {<code block>}) | # test_that("<descriptive string>", {<code block>}) | ||||||
|  |  | ||||||
| test_that("NA values are preserved", { | test_that("NA values are preserved", { | ||||||
|   # biCode() respects vector length: input and output must have the same length. |   # biCode() respects vector length: input and output must have the same length. | ||||||
|   # Therefore NA's can't be simply skipped, but must be properly passed |   # Therefore NA's can't be simply skipped, but must be properly passed | ||||||
|   # into output: |   # into output: | ||||||
|   expect_true(is.na((biCode(NA)))) |   expect_true(is.na((biCode(NA)))) | ||||||
|   expect_equal(biCode(c("first", NA, "last")), |   expect_equal(biCode(c("first", NA, "last")), | ||||||
|                c("FIRST", NA, "LAST.")) |                c("FIRST", NA, "LAST.")) | ||||||
| }) | }) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Task: Write a function calcGC() that calculates GC content in a sequence. | # Task: Write a function calcGC() that calculates GC content in a sequence. | ||||||
| #       Hint: you could strsplit() the sequence into a vector, and count | #       Hint: you could strsplit() the sequence into a vector, and count | ||||||
| #       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove | #       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove | ||||||
| #       A's and T's, and use nchar() before and after to calculate the content | #       A's and T's, and use nchar() before and after to calculate the content | ||||||
| #       from the length difference. | #       from the length difference. | ||||||
| #       Then write tests that: | #       Then write tests that: | ||||||
| #          confirm that calcGC("AATT") is 0; | #          confirm that calcGC("AATT") is 0; | ||||||
| #          confirm that calcGC("ATGC") is 0.5; | #          confirm that calcGC("ATGC") is 0.5; | ||||||
| #          confirm that calcGC("AC")   is 0.5; | #          confirm that calcGC("AC")   is 0.5; | ||||||
| #          confirm that calcGC("CGCG") is 1; | #          confirm that calcGC("CGCG") is 1; | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Organizing your tests  =============================================== | # =    2  Organizing your tests  =============================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Tests are only useful if they are actually executed and we need to make sure | # Tests are only useful if they are actually executed and we need to make sure | ||||||
| # there are no barriers to do that. The testthat package supports automatic | # there are no barriers to do that. The testthat package supports automatic | ||||||
| # execution of tests: | # execution of tests: | ||||||
| #  - put your tests into an R-script, | #  - put your tests into an R-script, | ||||||
| #  - save your tests in a file called "test_<my-function-name>.R" | #  - save your tests in a file called "test_<my-function-name>.R" | ||||||
| #  - execute the test with test_file("test_<my-function-name>.R") ... | #  - execute the test with test_file("test_<my-function-name>.R") ... | ||||||
| #  ... or, if you are working on a project ... | #  ... or, if you are working on a project ... | ||||||
| #  - place the file in a test-directory (e.g. the directory "tests" in this | #  - place the file in a test-directory (e.g. the directory "tests" in this | ||||||
| #      project), | #      project), | ||||||
| #  - execute all your tests with test_dir("<my-test-directory>") | #  - execute all your tests with test_dir("<my-test-directory>") | ||||||
|  |  | ||||||
| # For example I have provided a "tests" directory with this project, and | # For example I have provided a "tests" directory with this project, and | ||||||
| # placed the file "test_biCode.R" inside. | # placed the file "test_biCode.R" inside. | ||||||
| file.show("./tests/test_biCode.R") | file.show("./tests/test_biCode.R") | ||||||
|  |  | ||||||
| # Execute the file ... | # Execute the file ... | ||||||
| test_file("./tests/test_biCode.R") | test_file("./tests/test_biCode.R") | ||||||
|  |  | ||||||
| # .. or execute all the test files in the directory: | # .. or execute all the test files in the directory: | ||||||
| test_dir("./tests") | test_dir("./tests") | ||||||
|  |  | ||||||
| # ==   2.1  Testing scripts  =================================================== | # ==   2.1  Testing scripts  =================================================== | ||||||
|  |  | ||||||
| # Scripts need special consideration since we do not necessarily source() them | # Scripts need special consideration since we do not necessarily source() them | ||||||
| # entirely. Therefore automated testing is not reasonable. What you can do | # entirely. Therefore automated testing is not reasonable. What you can do | ||||||
| # instead is to place a conditional block at the end of your script, that | # instead is to place a conditional block at the end of your script, that | ||||||
| # never gets executed - then you can manually execute the code in the block | # never gets executed - then you can manually execute the code in the block | ||||||
| # whenever you wish to test your functions. For example: | # whenever you wish to test your functions. For example: | ||||||
|  |  | ||||||
| if (FALSE) { | if (FALSE) { | ||||||
|   # ... your tests go here |   # ... your tests go here | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| # ==   2.2  Rethinking testing  ================================================ | # ==   2.2  Rethinking testing  ================================================ | ||||||
|  |  | ||||||
| # However, it is important to keep in mind that different objectives lead to | # However, it is important to keep in mind that different objectives lead to | ||||||
| # different ideas of what works best. There is never a "best" in and of itself, | # different ideas of what works best. There is never a "best" in and of itself, | ||||||
| # the question is always: "Best for what?" While automated unit testing is a | # the question is always: "Best for what?" While automated unit testing is a | ||||||
| # great way to assure the integrity of packages and larger software artefacts as | # great way to assure the integrity of packages and larger software artefacts as | ||||||
| # they are being developed, more loosely conceived aggregates of code - like the | # they are being developed, more loosely conceived aggregates of code - like the | ||||||
| # scripts for this course for example - have different objectives and in this | # scripts for this course for example - have different objectives and in this | ||||||
| # case I find the testthat approach to actually be inferior. The reason is its | # case I find the testthat approach to actually be inferior. The reason is its | ||||||
| # tendency to physically separate code and tests. Keeping assets, and functions | # tendency to physically separate code and tests. Keeping assets, and functions | ||||||
| # that operate on those assets separated is always poor design. I have found | # that operate on those assets separated is always poor design. I have found | ||||||
| # over time that a more stable approach is to move individual functions into | # over time that a more stable approach is to move individual functions into | ||||||
| # their individual scripts, all in one folder, one function (and its helpers) | # their individual scripts, all in one folder, one function (and its helpers) | ||||||
| # per file, and examples, demos and tests in an if (FALSE) { ... } block, as | # per file, and examples, demos and tests in an if (FALSE) { ... } block, as | ||||||
| # explained above. | # explained above. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Task solutions  ====================================================== | # =    3  Task solutions  ====================================================== | ||||||
|  |  | ||||||
| calcGC <- function(s) { | calcGC <- function(s) { | ||||||
|   s <- gsub("[^agctAGCT]", "", s) |   s <- gsub("[^agctAGCT]", "", s) | ||||||
|   return(nchar(gsub("[atAT]", "", s)) / nchar(s)) |   return(nchar(gsub("[atAT]", "", s)) / nchar(s)) | ||||||
| } | } | ||||||
|  |  | ||||||
| expect_equal(calcGC("AATT"), 0) | expect_equal(calcGC("AATT"), 0) | ||||||
| expect_equal(calcGC("ATGC"), 0.5) | expect_equal(calcGC("ATGC"), 0.5) | ||||||
| expect_equal(calcGC("AC"),   0.5) | expect_equal(calcGC("AC"),   0.5) | ||||||
| expect_equal(calcGC("CGCG"), 1) | expect_equal(calcGC("CGCG"), 1) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										332
									
								
								RPR-eUtils_XML.R
									
									
									
									
									
								
							
							
						
						
									
										332
									
								
								RPR-eUtils_XML.R
									
									
									
									
									
								
							| @@ -1,166 +1,166 @@ | |||||||
| # tocID <- "RPR-eUtils_XML.R" | # tocID <- "RPR-eUtils_XML.R" | ||||||
| # | # | ||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||||
| # | # | ||||||
| # Version:  1.2.1 | # Version:  1.2.1 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2021-09 | # Date:     2017-10  -  2021-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2.1  2021 Maintenance | #           1.2.1  2021 Maintenance | ||||||
| #           1.2    2020 Updates | #           1.2    2020 Updates | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    First ABC units version | #           1.0    First ABC units version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # | # | ||||||
| # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | # == DO NOT SIMPLY  source()  THIS FILE! ======================================= | ||||||
| # | # | ||||||
| # If there are portions you don't understand, use R's help system, Google for an | # If there are portions you don't understand, use R's help system, Google for an | ||||||
| # answer, or ask your instructor. Don't continue if you don't understand what's | # answer, or ask your instructor. Don't continue if you don't understand what's | ||||||
| # going on. That's not how it works ... | # going on. That's not how it works ... | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                       Line | #TOC>   Section  Title                                       Line | ||||||
| #TOC> ----------------------------------------------------------- | #TOC> ----------------------------------------------------------- | ||||||
| #TOC>   1        Working with NCBI eUtils                      43 | #TOC>   1        Working with NCBI eUtils                      43 | ||||||
| #TOC>   1.1        Task - fetchNCBItaxData() function         145 | #TOC>   1.1        Task - fetchNCBItaxData() function         145 | ||||||
| #TOC>   2        Task solutions                               152 | #TOC>   2        Task solutions                               152 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Working with NCBI eUtils  ============================================ | # =    1  Working with NCBI eUtils  ============================================ | ||||||
|  |  | ||||||
|  |  | ||||||
| # To begin, we load the xml2 package that contains functions | # To begin, we load the xml2 package that contains functions | ||||||
| # we need to receive and parse html data. NCBI's eUtils send information in | # we need to receive and parse html data. NCBI's eUtils send information in | ||||||
| # XML format so we need to be able to parse XML. | # XML format so we need to be able to parse XML. | ||||||
| if (! requireNamespace("xml2", quietly=TRUE)) { | if (! requireNamespace("xml2", quietly=TRUE)) { | ||||||
|   install.packages("xml2") |   install.packages("xml2") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = xml2)       # basic information | #  library(help = xml2)       # basic information | ||||||
| #  browseVignettes("xml2")    # available vignettes | #  browseVignettes("xml2")    # available vignettes | ||||||
| #  data(package = "xml2")     # available datasets | #  data(package = "xml2")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # We will walk through the process with the refSeqID | # We will walk through the process with the refSeqID | ||||||
| # of yeast Mbp1 | # of yeast Mbp1 | ||||||
| refSeqID <- "NP_010227" | refSeqID <- "NP_010227" | ||||||
|  |  | ||||||
|  |  | ||||||
| # First we build a query URL... | # First we build a query URL... | ||||||
| eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" | eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" | ||||||
|  |  | ||||||
|  |  | ||||||
| # Then we assemble a URL that will search for and get the | # Then we assemble a URL that will search for and get the | ||||||
| # unique, NCBI internal identifier, | # unique, NCBI internal identifier, | ||||||
| # for our refSeqID... | # for our refSeqID... | ||||||
| # Assemble the esearch query; paste0() concatenates without a separator. | # Assemble the esearch query; paste0() concatenates without a separator. | ||||||
| URL <- paste0(eUtilsBase, | URL <- paste0(eUtilsBase, | ||||||
|               "esearch.fcgi?",    # ...using the esearch program |               "esearch.fcgi?",    # ...using the esearch program | ||||||
|                                   # that finds an entry in an |                                   # that finds an entry in an | ||||||
|                                   # NCBI database |                                   # NCBI database | ||||||
|               "db=protein", |               "db=protein", | ||||||
|               "&term=", refSeqID) |               "&term=", refSeqID) | ||||||
| # Copy the URL and paste it into your browser to see | # Copy the URL and paste it into your browser to see | ||||||
| # what the response should look like. | # what the response should look like. | ||||||
| URL | URL | ||||||
|  |  | ||||||
| # To fetch a response in R, we use the function read_xml() | # To fetch a response in R, we use the function read_xml() | ||||||
| # with our URL as its argument. | # with our URL as its argument. | ||||||
| ( myXML <- xml2::read_xml(URL) ) | ( myXML <- xml2::read_xml(URL) ) | ||||||
|  |  | ||||||
| # This is XML. We can take the response apart into | # This is XML. We can take the response apart into | ||||||
| # its individual components with the as_list() function. | # its individual components with the as_list() function. | ||||||
|  |  | ||||||
| xml2::as_list(myXML) | xml2::as_list(myXML) | ||||||
|  |  | ||||||
| # Note how the XML "tree" is represented as a list of | # Note how the XML "tree" is represented as a list of | ||||||
| # lists of lists ... | # lists of lists ... | ||||||
| # If we know exactly what element we are looking for, | # If we know exactly what element we are looking for, | ||||||
| # we can extract it from this structure: | # we can extract it from this structure: | ||||||
| xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]] | xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]] | ||||||
|  |  | ||||||
| # But this is not very robust, it would break with the | # But this is not very robust, it would break with the | ||||||
| # slightest change that the NCBI makes to their data format - | # slightest change that the NCBI makes to their data format - | ||||||
| # and the NCBI changes things A LOT! | # and the NCBI changes things A LOT! | ||||||
|  |  | ||||||
| # Somewhat more robust is to specify the type of element | # Somewhat more robust is to specify the type of element | ||||||
| # we want - it's the text contained in an <Id>...</Id> | # we want - it's the text contained in an <Id>...</Id> | ||||||
| # element, and use the XPath XML parsing language to | # element, and use the XPath XML parsing language to | ||||||
| # retrieve it. | # retrieve it. | ||||||
|  |  | ||||||
| xml2::xml_find_all(myXML, "//Id") # returns a "node set" | xml2::xml_find_all(myXML, "//Id") # returns a "node set" | ||||||
|  |  | ||||||
| xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents | xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents | ||||||
|                                                   # of the node set |                                                   # of the node set | ||||||
|  |  | ||||||
| # We will need to do this more than once, so we write a function | # We will need to do this more than once, so we write a function | ||||||
| # for it... | # for it... | ||||||
| node2text <- function(doc, tag) { | node2text <- function(doc, tag) { | ||||||
|   # Extract the text of every <tag> element found anywhere in the XML |   # Extract the text of every <tag> element found anywhere in the XML | ||||||
|   # document doc. The XPath "//tag" matches at any depth; the result is a |   # document doc. The XPath "//tag" matches at any depth; the result is a | ||||||
|   # character vector with one string per matching element. |   # character vector with one string per matching element. | ||||||
|   xml2::xml_text(xml2::xml_find_all(doc, paste0("//", tag))) |   xml2::xml_text(xml2::xml_find_all(doc, paste0("//", tag))) | ||||||
| } | } | ||||||
|  |  | ||||||
| # using node2text() ... | # using node2text() ... | ||||||
| (GID <- node2text(myXML, "Id")) | (GID <- node2text(myXML, "Id")) | ||||||
|  |  | ||||||
| # The GI is the pivot for data requests at the | # The GI is the pivot for data requests at the | ||||||
| # NCBI. | # NCBI. | ||||||
|  |  | ||||||
| # Let's first get the associated data for this GI | # Let's first get the associated data for this GI | ||||||
| URL <- paste0(eUtilsBase, | URL <- paste0(eUtilsBase, | ||||||
|               "esummary.fcgi?", |               "esummary.fcgi?", | ||||||
|               "db=protein", |               "db=protein", | ||||||
|               "&id=", |               "&id=", | ||||||
|               GID, |               GID, | ||||||
|               "&version=2.0") |               "&version=2.0") | ||||||
| (myXML <- xml2::read_xml(URL)) | (myXML <- xml2::read_xml(URL)) | ||||||
|  |  | ||||||
| (taxID <- node2text(myXML, "TaxId")) | (taxID <- node2text(myXML, "TaxId")) | ||||||
| (organism <- node2text(myXML, "Organism")) | (organism <- node2text(myXML, "Organism")) | ||||||
|  |  | ||||||
| #  This forms the base of a function that gets taxonomy data | #  This forms the base of a function that gets taxonomy data | ||||||
| #  from an Entrez result. You can write this! | #  from an Entrez result. You can write this! | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.1  Task - fetchNCBItaxData() function  ================================ | # ==   1.1  Task - fetchNCBItaxData() function  ================================ | ||||||
|  |  | ||||||
| # Task: write a function that takes as input a RefSeq ID, fetches the taxonomy | # Task: write a function that takes as input a RefSeq ID, fetches the taxonomy | ||||||
| # information, returns a list with taxID and organism, if the operation is | # information, returns a list with taxID and organism, if the operation is | ||||||
| # successful, or a list of length 0 if there is an error. | # successful, or a list of length 0 if there is an error. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Task solutions  ====================================================== | # =    2  Task solutions  ====================================================== | ||||||
|  |  | ||||||
| # I have placed such a function into the dbUtilities script: look it up by | # I have placed such a function into the dbUtilities script: look it up by | ||||||
| # clicking on  dbFetchNCBItaxData() in the Environment pane. | # clicking on  dbFetchNCBItaxData() in the Environment pane. | ||||||
|  |  | ||||||
| # Test: | # Test: | ||||||
| dbFetchNCBItaxData("XP_001837394") | dbFetchNCBItaxData("XP_001837394") | ||||||
|  |  | ||||||
| # Expected output: | # Expected output: | ||||||
| # ---------------- | # ---------------- | ||||||
| # taxID                         organism | # taxID                         organism | ||||||
| # 1 240176 Coprinopsis cinerea okayama7#130 | # 1 240176 Coprinopsis cinerea okayama7#130 | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,10 +1,10 @@ | |||||||
| HEADER   TEST                                                 0TST      0TST   1 | HEADER   TEST                                                 0TST      0TST   1 | ||||||
| REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2 | REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2 | ||||||
| ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3 | ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3 | ||||||
| ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4 | ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4 | ||||||
| ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5 | ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5 | ||||||
| ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6 | ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6 | ||||||
| TER       5      GLY     1                                              0TST   7 | TER       5      GLY     1                                              0TST   7 | ||||||
| HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8 | HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8 | ||||||
| HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9 | HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9 | ||||||
| END                                                                     0TST  10 | END                                                                     0TST  10 | ||||||
|   | |||||||
							
								
								
									
										3104
									
								
								data/1BM8.pdb
									
									
									
									
									
								
							
							
						
						
									
										3104
									
								
								data/1BM8.pdb
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,5 +1,5 @@ | |||||||
| >2F1C:X|PDBID|CHAIN|SEQUENCE | >2F1C:X|PDBID|CHAIN|SEQUENCE | ||||||
| EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN | EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN | ||||||
| DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT | DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT | ||||||
| FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY | FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY | ||||||
| GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH | GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH | ||||||
							
								
								
									
										12
									
								
								data/3FG7.fa
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								data/3FG7.fa
									
									
									
									
									
								
							| @@ -1,6 +1,6 @@ | |||||||
| >3FG7:A|PDBID|CHAIN|SEQUENCE | >3FG7:A|PDBID|CHAIN|SEQUENCE | ||||||
| MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK | MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK | ||||||
| WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM | WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM | ||||||
| VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT | VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT | ||||||
| ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD | ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD | ||||||
| QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN | QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN | ||||||
|   | |||||||
| @@ -1,20 +1,20 @@ | |||||||
| [ | [ | ||||||
|   { "name" : "MBP1_SACCE", |   { "name" : "MBP1_SACCE", | ||||||
|     "RefSeqID" : "NP_010227", |     "RefSeqID" : "NP_010227", | ||||||
|     "UniProtID" : "P39678", |     "UniProtID" : "P39678", | ||||||
|     "taxonomyID" : 559292, |     "taxonomyID" : 559292, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF", |        "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF", | ||||||
|        "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET", |        "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET", | ||||||
|        "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL", |        "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL", | ||||||
|        "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ", |        "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ", | ||||||
|        "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV", |        "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV", | ||||||
|        "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", |        "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS", | ||||||
|        "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL", |        "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL", | ||||||
|        "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM", |        "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM", | ||||||
|        "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ", |        "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ", | ||||||
|        "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK", |        "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK", | ||||||
|        "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS", |        "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS", | ||||||
|        "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"] |        "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"] | ||||||
|   } |   } | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -1,30 +1,30 @@ | |||||||
| >PTPN5-201 cds:protein_coding (ENST00000358540.7) | >PTPN5-201 cds:protein_coding (ENST00000358540.7) | ||||||
| ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA | ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA | ||||||
| GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG | GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG | ||||||
| GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT | GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT | ||||||
| CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC | CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC | ||||||
| TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT | TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT | ||||||
| GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC | GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC | ||||||
| TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG | TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG | ||||||
| TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC | TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC | ||||||
| CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC | CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC | ||||||
| AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG | AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG | ||||||
| ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG | ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG | ||||||
| ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG | ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG | ||||||
| GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG | GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG | ||||||
| GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC | GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC | ||||||
| GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG | GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG | ||||||
| CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT | CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT | ||||||
| GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT | GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT | ||||||
| CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG | CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG | ||||||
| GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC | GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC | ||||||
| GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC | GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC | ||||||
| ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC | ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC | ||||||
| GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC | GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC | ||||||
| TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC | TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC | ||||||
| GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG | GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG | ||||||
| GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT | GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT | ||||||
| GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT | GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT | ||||||
| GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG | GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG | ||||||
| ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC | ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC | ||||||
| CACCAGTCCCCAGAATGA | CACCAGTCCCCAGAATGA | ||||||
|   | |||||||
| @@ -1,12 +1,12 @@ | |||||||
| >RAB39B cds:protein_coding (ENST00000369454.4) | >RAB39B cds:protein_coding (ENST00000369454.4) | ||||||
| ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC | ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC | ||||||
| AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC | AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC | ||||||
| GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC | GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC | ||||||
| CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG | CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG | ||||||
| AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC | AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC | ||||||
| CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG | CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG | ||||||
| GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA | GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA | ||||||
| CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG | CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG | ||||||
| GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT | GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT | ||||||
| ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT | ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT | ||||||
| TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG | TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG | ||||||
|   | |||||||
| @@ -1,131 +1,131 @@ | |||||||
|  |  | ||||||
|  |  | ||||||
| ```{css, echo = FALSE} | ```{css, echo = FALSE} | ||||||
|  |  | ||||||
| .striped tr:nth-child(even) { | .striped tr:nth-child(even) { | ||||||
|   background: #eaf1ff; |   background: #eaf1ff; | ||||||
| } | } | ||||||
| .striped { | .striped { | ||||||
|   padding: 5px; |   padding: 5px; | ||||||
| } | } | ||||||
| ``` | ``` | ||||||
| <small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 --> | <small>Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 --> | ||||||
|  |  | ||||||
|  |  | ||||||
| ```{r setup, include=FALSE} | ```{r setup, include=FALSE} | ||||||
| knitr::opts_chunk$set(echo = TRUE) | knitr::opts_chunk$set(echo = TRUE) | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| ## Phobias! ## | ## Phobias! ## | ||||||
| We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>. | We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>. | ||||||
|  |  | ||||||
| To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` library and the `xml2` library from CRAN, if we don't have it. | To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` library and the `xml2` library from CRAN, if we don't have it. | ||||||
| ```{r packages} | ```{r packages} | ||||||
| if (! requireNamespace("rvest", quietly=TRUE)) { | if (! requireNamespace("rvest", quietly=TRUE)) { | ||||||
|   install.packages("rvest") |   install.packages("rvest") | ||||||
| } | } | ||||||
| if (! requireNamespace("xml2", quietly=TRUE)) { | if (! requireNamespace("xml2", quietly=TRUE)) { | ||||||
|   install.packages("xml2") |   install.packages("xml2") | ||||||
| } | } | ||||||
| ``` | ``` | ||||||
| As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable. | As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable. | ||||||
|  |  | ||||||
| `xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames. | `xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames. | ||||||
|  |  | ||||||
| ```{r getPageData, cache=TRUE} | ```{r getPageData, cache=TRUE} | ||||||
| webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias") | webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias") | ||||||
| allTables <- rvest::html_table(webPage, fill = TRUE) | allTables <- rvest::html_table(webPage, fill = TRUE) | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`. | There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`. | ||||||
|  |  | ||||||
| ```{r collateTables, cache=TRUE} | ```{r collateTables, cache=TRUE} | ||||||
| # Keep only the tables whose columns are exactly "Phobia" and "Condition". | # Keep only the tables whose columns are exactly "Phobia" and "Condition". | ||||||
| # identical() is used rather than all(... == ...), because "==" recycles its | # identical() is used rather than all(... == ...), because "==" recycles its | ||||||
| # shorter operand and all() of a zero-length comparison is TRUE. | # shorter operand and all() of a zero-length comparison is TRUE. | ||||||
| isPhobiaTable <- vapply(allTables, | isPhobiaTable <- vapply(allTables, | ||||||
|                         function(df) { |                         function(df) { | ||||||
|                           identical(colnames(df), c("Phobia", "Condition")) |                           identical(colnames(df), c("Phobia", "Condition")) | ||||||
|                         }, |                         }, | ||||||
|                         logical(1)) |                         logical(1)) | ||||||
| # Start from an empty, correctly typed data frame so that the result is well | # Start from an empty, correctly typed data frame so that the result is well | ||||||
| # defined even if no table matches, then bind all matches in a single call. | # defined even if no table matches, then bind all matches in a single call. | ||||||
| phobiaTable <- do.call(rbind, | phobiaTable <- do.call(rbind, | ||||||
|                        c(list(data.frame(Phobia = character(), |                        c(list(data.frame(Phobia = character(), | ||||||
|                                          Condition = character())), |                                          Condition = character())), | ||||||
|                          allTables[isPhobiaTable])) |                          allTables[isPhobiaTable])) | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them. | Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them. | ||||||
|  |  | ||||||
| <p>  | <p>  | ||||||
| <p> | <p> | ||||||
|  |  | ||||||
| ```{r , ref.label="randRow", echo=FALSE} | ```{r , ref.label="randRow", echo=FALSE} | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| **Table**: seven random phobias<br/> | **Table**: seven random phobias<br/> | ||||||
| ```{r renderPhobiaTable, echo=FALSE, results='asis'} | ```{r renderPhobiaTable, echo=FALSE, results='asis'} | ||||||
| sel <- sample(1:nrow(phobiaTable), 7) | sel <- sample(1:nrow(phobiaTable), 7) | ||||||
| knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html") | knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html") | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| <p>  | <p>  | ||||||
| <p> | <p> | ||||||
| To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function. | To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function. | ||||||
|  |  | ||||||
| ```{r randRow} | ```{r randRow} | ||||||
randRow <- function(M, seed = FALSE) {
  # Return one (pseudo-)random row from a matrix or dataframe M.
  #
  # Parameters:
  #   M     matrix or dataframe with at least one row.
  #   seed  FALSE (the default) or NULL: draw from the current RNG stream.
  #         Anything else: coerced with as.integer() and passed to set.seed()
  #         for this one draw only (0 is a valid seed, TRUE behaves like 1).
  # Value:
  #   The selected row, i.e. M[i, ] for a random row index i.
  #
  # When a seed is given, the global RNG state is put back exactly as it was
  # found, so calling this function does not disturb the caller's stream.

  if (nrow(M) < 1) {
    stop("randRow(): M has no rows to sample from.", call. = FALSE)
  }

  useSeed <- !(is.null(seed) || identical(seed, FALSE))
  if (useSeed) {
    # .Random.seed lives in the global environment and does not exist until
    # the RNG has been used for the first time in a session.
    genv <- globalenv()
    hadState <- exists(".Random.seed", envir = genv, inherits = FALSE)
    oldState <- if (hadState) {
      get(".Random.seed", envir = genv, inherits = FALSE)
    } else {
      NULL
    }
    # Play nice and restore the RNG state on exit. The restore has to assign
    # into the global environment: a plain `<-` inside a function would only
    # create a local variable and leave the global state changed.
    on.exit({
      if (hadState) {
        assign(".Random.seed", oldState, envir = genv)
      } else {
        rm(".Random.seed", envir = genv)
      }
    }, add = TRUE)
    set.seed(as.integer(seed))
  }

  M[sample.int(nrow(M), 1L), ]             # fetch one random row
}
| ``` | ``` | ||||||
| <p>  | <p>  | ||||||
| <p> | <p> | ||||||
| With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`. | With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`. | ||||||
|  |  | ||||||
| _`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful. | _`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful. | ||||||
|  |  | ||||||
| <p>  | <p>  | ||||||
| <p> | <p> | ||||||
|  |  | ||||||
| Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up. | Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up. | ||||||
|  |  | ||||||
| ```{r preProcess} | ```{r preProcess} | ||||||
|  |  | ||||||
# Build "names": every single-word term that ends in "phobia".

# Entries that already are one word ending in "phobia" are taken as they are.
sel <- grepl(".phobia$", phobiaTable$Phobia) & ! grepl(" ", phobiaTable$Phobia)
names <- phobiaTable$Phobia[sel]

# The entries we did _not_ select collate several terms. Break them apart on
# ", ", then " ", then "/", flattening the list strsplit() returns each time.
x <- phobiaTable$Phobia[! sel]
x <- Reduce(function(terms, sep) unlist(strsplit(terms, sep)),
            c(", ", " ", "/"),
            x)

# Apply the same selection to the fragments and append the result to "names".
sel <- grepl(".phobia$", x) & ! grepl(" ", x)
names <- c(names, x[sel])
|  |  | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths. | Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths. | ||||||
|  |  | ||||||
| ```{r showHist} | ```{r showHist} | ||||||
|  |  | ||||||
# Histogram of the name lengths; the subtitle reports the first shortest and
# the first longest name together with their lengths.
x <- nchar(names)
pShort <- names[which.min(x)]  # first name of minimal length ...
pLong  <- names[which.max(x)]  # ... and first name of maximal length.
hist(x,
     col = "#aef5ee",
     main = "Length of phobia-names",
     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
                   pShort, nchar(pShort), pLong, nchar(pLong)),
     cex.sub = 0.8,
     xlab = "name",
     ylab = "counts")
|  |  | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| That's all. | That's all. | ||||||
|  |  | ||||||
| <!-- [END] --> | <!-- [END] --> | ||||||
|   | |||||||
| @@ -1,43 +1,43 @@ | |||||||
| >MBP1 YDL056W SGDID:S000002214 | >MBP1 YDL056W SGDID:S000002214 | ||||||
| ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT | ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT | ||||||
| TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA | TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA | ||||||
| AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG | AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG | ||||||
| GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG | GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG | ||||||
| AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC | AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC | ||||||
| GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC | GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC | ||||||
| TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA | TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA | ||||||
| AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT | AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT | ||||||
| CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG | CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG | ||||||
| AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA | AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA | ||||||
| CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA | CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA | ||||||
| TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA | TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA | ||||||
| CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG | CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG | ||||||
| GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA | GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA | ||||||
| CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT | CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT | ||||||
| CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT | CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT | ||||||
| CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG | CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG | ||||||
| CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT | CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT | ||||||
| TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT | TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT | ||||||
| CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT | CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT | ||||||
| TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT | TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT | ||||||
| ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT | ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT | ||||||
| TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT | TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT | ||||||
| ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT | ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT | ||||||
| TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT | TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT | ||||||
| AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT | AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT | ||||||
| TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT | TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT | ||||||
| ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG | ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG | ||||||
| ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC | ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC | ||||||
| GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT | GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT | ||||||
| GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT | GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT | ||||||
| ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA | ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA | ||||||
| CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA | CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA | ||||||
| ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC | ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC | ||||||
| GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG | GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG | ||||||
| AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG | AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG | ||||||
| CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA | CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA | ||||||
| GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG | GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG | ||||||
| CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG | CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG | ||||||
| ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT | ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT | ||||||
| AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA | AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA | ||||||
| GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA | GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA | ||||||
| @@ -1,47 +1,47 @@ | |||||||
| SGD_features.tab | SGD_features.tab | ||||||
|  |  | ||||||
| The latest version of the SGD_features.tab file is based on Genome Version R64-2-1. | The latest version of the SGD_features.tab file is based on Genome Version R64-2-1. | ||||||
|  |  | ||||||
| The SGD_features.tab file is updated weekly (Saturday). | The SGD_features.tab file is updated weekly (Saturday). | ||||||
|  |  | ||||||
| NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously | NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously | ||||||
| used chromosomal_feature.tab file. | used chromosomal_feature.tab file. | ||||||
|  |  | ||||||
| File contents: | File contents: | ||||||
|  |  | ||||||
| 1. Information on current chromosomal features in SGD, including Dubious ORFs.  | 1. Information on current chromosomal features in SGD, including Dubious ORFs.  | ||||||
| Also contains coordinates of introns, exons, and other subfeatures that are located | Also contains coordinates of introns, exons, and other subfeatures that are located | ||||||
| within a chromosomal feature. | within a chromosomal feature. | ||||||
|  |  | ||||||
| 2. The relationship between subfeatures and the feature in which they | 2. The relationship between subfeatures and the feature in which they | ||||||
| are located is identified by the feature name in column #7 (parent | are located is identified by the feature name in column #7 (parent | ||||||
| feature). For example, the parent feature of the intron found in | feature). For example, the parent feature of the intron found in | ||||||
| ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is | ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is | ||||||
| chromosome 6. | chromosome 6. | ||||||
|  |  | ||||||
| 3. The coordinates of all features are in chromosomal coordinates. | 3. The coordinates of all features are in chromosomal coordinates. | ||||||
|  |  | ||||||
|  |  | ||||||
| Columns within SGD_features.tab: | Columns within SGD_features.tab: | ||||||
|  |  | ||||||
| 1.   Primary SGDID (mandatory) | 1.   Primary SGDID (mandatory) | ||||||
| 2.   Feature type (mandatory) | 2.   Feature type (mandatory) | ||||||
| 3.   Feature qualifier (optional) | 3.   Feature qualifier (optional) | ||||||
| 4.   Feature name (optional) | 4.   Feature name (optional) | ||||||
| 5.   Standard gene name (optional) | 5.   Standard gene name (optional) | ||||||
| 6.   Alias (optional, multiples separated by |) | 6.   Alias (optional, multiples separated by |) | ||||||
| 7.   Parent feature name (optional) | 7.   Parent feature name (optional) | ||||||
| 8.   Secondary SGDID (optional, multiples separated by |) | 8.   Secondary SGDID (optional, multiples separated by |) | ||||||
| 9.   Chromosome (optional) | 9.   Chromosome (optional) | ||||||
| 10.  Start_coordinate (optional) | 10.  Start_coordinate (optional) | ||||||
| 11.  Stop_coordinate (optional) | 11.  Stop_coordinate (optional) | ||||||
| 12.  Strand (optional) | 12.  Strand (optional) | ||||||
| 13.  Genetic position (optional) | 13.  Genetic position (optional) | ||||||
| 14.  Coordinate version (optional) | 14.  Coordinate version (optional) | ||||||
| 15.  Sequence version (optional) | 15.  Sequence version (optional) | ||||||
| 16.  Description (optional) | 16.  Description (optional) | ||||||
|  |  | ||||||
| Note that "chromosome 17" is the mitochondrial chromosome. | Note that "chromosome 17" is the mitochondrial chromosome. | ||||||
|  |  | ||||||
| The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff | The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										32908
									
								
								data/SGD_features.tab
									
									
									
									
									
								
							
							
						
						
									
										32908
									
								
								data/SGD_features.tab
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										2030
									
								
								data/Species.csv
									
									
									
									
									
								
							
							
						
						
									
										2030
									
								
								data/Species.csv
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,179 +1,179 @@ | |||||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936 | 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936 | ||||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334 | 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334 | ||||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078 | 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078 | ||||||
| 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131 | 93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131 | ||||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936 | 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936 | ||||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334 | 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334 | ||||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131 | 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131 | ||||||
| 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078 | 86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078 | ||||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131 | 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131 | ||||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078 | 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078 | ||||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334 | 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334 | ||||||
| 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936 | 72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936 | ||||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334 | 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334 | ||||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131 | 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131 | ||||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078 | 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078 | ||||||
| 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936 | 63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936 | ||||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936 | 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936 | ||||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078 | 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078 | ||||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131 | 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131 | ||||||
| 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334 | 36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334 | ||||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078 | 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078 | ||||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936 | 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936 | ||||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334 | 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334 | ||||||
| 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131 | 24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131 | ||||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131 | 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131 | ||||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936 | 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936 | ||||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334 | 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334 | ||||||
| 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078 | 23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078 | ||||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131 | 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131 | ||||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936 | 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936 | ||||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334 | 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334 | ||||||
| 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078 | 16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078 | ||||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936 | 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936 | ||||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131 | 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131 | ||||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334 | 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334 | ||||||
| 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078 | 13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078 | ||||||
| 11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936 | 11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936 | ||||||
| 11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078 | 11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078 | ||||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334 | 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334 | ||||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936 | 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936 | ||||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131 | 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131 | ||||||
| 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078 | 10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078 | ||||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334 | 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334 | ||||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131 | 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131 | ||||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936 | 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936 | ||||||
| 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078 | 9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078 | ||||||
| 7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078 | 7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078 | ||||||
| 7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078 | 7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078 | ||||||
| 7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936 | 7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936 | ||||||
| 7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936 | 7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936 | ||||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936 | 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936 | ||||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078 | 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078 | ||||||
| 5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936 | 5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936 | ||||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334 | 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334 | ||||||
| 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131 | 5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131 | ||||||
| 5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078 | 5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078 | ||||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078 | 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078 | ||||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334 | 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334 | ||||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936 | 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936 | ||||||
| 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131 | 4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131 | ||||||
| 3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078 | 3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078 | ||||||
| 3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078 | 3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078 | ||||||
| 3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078 | 3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078 | ||||||
| 3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936 | 3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936 | ||||||
| 3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936 | 3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936 | ||||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078 | 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078 | ||||||
| 3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078 | 3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078 | ||||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078 | 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078 | ||||||
| 3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936 | 3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936 | ||||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936 | 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936 | ||||||
| 3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936 | 3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936 | ||||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936 | 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936 | ||||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131 | 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131 | ||||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334 | 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334 | ||||||
| 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334 | 3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334 | ||||||
| 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131 | 3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131 | ||||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131 | 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131 | ||||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936 | 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936 | ||||||
| 2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936 | 2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936 | ||||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334 | 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334 | ||||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131 | 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131 | ||||||
| 2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936 | 2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936 | ||||||
| 2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078 | 2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078 | ||||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078 | 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078 | ||||||
| 2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078 | 2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078 | ||||||
| 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936 | 2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936 | ||||||
| 2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936 | 2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936 | ||||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078 | 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078 | ||||||
| 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334 | 2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334 | ||||||
| 2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078 | 2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334 | 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334 | ||||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334 | 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334 | ||||||
| 1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334 | 1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334 | ||||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334 | 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334 | ||||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334 | 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334 | ||||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334 | 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334 | ||||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334 | 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334 | ||||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334 | 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334 | ||||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334 | 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334 | ||||||
| 0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936 | 0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936 | ||||||
| 1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078 | 1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078 | ||||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078 | 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078 | ||||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078 | 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936 | ||||||
| 0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936 | 0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078 | ||||||
| 0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078 | 0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078 | ||||||
| 0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078 | 0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078 | ||||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078 | 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078 | ||||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078 | 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078 | ||||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078 | 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936 | ||||||
| 0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936 | 0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131 | 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131 | ||||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131 | 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131 | ||||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131 | 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131 | ||||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131 | 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131 | ||||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131 | 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131 | ||||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131 | 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131 | ||||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131 | 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131 | ||||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131 | 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131 | ||||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131 | 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131 | ||||||
| 1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334 | 1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334 | ||||||
| 0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334 | 0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334 | ||||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334 | 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334 | ||||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334 | 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334 | ||||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334 | 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334 | ||||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334 | 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334 | ||||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131 | 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131 | ||||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131 | 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131 | ||||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131 | 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131 | ||||||
| 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936 | ||||||
| 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936 | 0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936 | ||||||
| 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936 | 0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936 | ||||||
| 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936 | 0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936 | ||||||
| 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936 | 1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936 | 1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936 | ||||||
| 1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936 | 1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936 | ||||||
| 1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936 | 1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936 | ||||||
| 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936 | 1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936 | ||||||
| 1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078 | ||||||
| 1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078 | 1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078 | ||||||
|   | |||||||
| 
 | 
| @@ -1,49 +1,49 @@ | |||||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||||
| 2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094 | 2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094 | ||||||
| 2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094 | 2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094 | 1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094 | 1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094 | 1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094 | ||||||
| 0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094 | 0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094 | ||||||
| 1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094 | 1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094 | ||||||
|   | |||||||
| 
 | 
| @@ -1,113 +1,113 @@ | |||||||
| MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT | ||||||
| 5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677 | 5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677 | ||||||
| 4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677 | 4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677 | ||||||
| 3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597 | 3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597 | ||||||
| 3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677 | 3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677 | ||||||
| 2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677 | 2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677 | ||||||
| 2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677 | 2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677 | ||||||
| 2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677 | 2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677 | ||||||
| 2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597 | 2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597 | ||||||
| 2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597 | 2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597 | ||||||
| 2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597 | 2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597 | ||||||
| 2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597 | 2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597 | ||||||
| 2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677 | 2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677 | ||||||
| 2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677 | 2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677 | ||||||
| 2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677 | 2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677 | ||||||
| 2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677 | 2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818 | 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818 | ||||||
| 1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597 | ||||||
| 0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597 | 0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597 | 1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597 | 1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818 | 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818 | ||||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818 | 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818 | ||||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818 | 1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818 | ||||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818 | 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818 | ||||||
| 1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597 | ||||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597 | 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597 | ||||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597 | 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597 | ||||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597 | 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597 | ||||||
| 0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597 | 0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597 | ||||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597 | 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597 | 1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597 | ||||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818 | 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818 | ||||||
| 1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597 | ||||||
| 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677 | 1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597 | 0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597 | 1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597 | ||||||
| 1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677 | ||||||
| 0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677 | 0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677 | 1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677 | 1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677 | ||||||
| 1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677 | 1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677 | ||||||
|   | |||||||
| 
 | 
| @@ -1,39 +1,39 @@ | |||||||
| >MBP1_ASPNI AN3154 XP_660758 Q5B8H6 | >MBP1_ASPNI AN3154 XP_660758 Q5B8H6 | ||||||
| -VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI | -VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI | ||||||
| LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY | LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY | ||||||
|  |  | ||||||
| >MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86 | >MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86 | ||||||
| KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI | KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI | ||||||
| LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY | LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY | ||||||
|  |  | ||||||
| >MBP1_COPCI  - XP_001837394 A8NYC6 | >MBP1_COPCI  - XP_001837394 A8NYC6 | ||||||
| QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV | QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV | ||||||
| LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF | LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF | ||||||
|  |  | ||||||
| >MBP1_CRYNE  - XP_569090 Q5KMQ9 | >MBP1_CRYNE  - XP_569090 Q5KMQ9 | ||||||
| DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA | DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA | ||||||
| LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV | LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV | ||||||
|  |  | ||||||
| >MBP1_NEUCR Swi4 XP_955821 Q7RW59 | >MBP1_NEUCR Swi4 XP_955821 Q7RW59 | ||||||
| -IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI | -IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI | ||||||
| LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF | LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF | ||||||
|  |  | ||||||
| >MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4 | >MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4 | ||||||
| -IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV | -IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV | ||||||
| LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF | LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF | ||||||
|  |  | ||||||
| >MBP1_SACCE Mbp1 NP_010227 P39678 | >MBP1_SACCE Mbp1 NP_010227 P39678 | ||||||
| QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI | QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI | ||||||
| LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF | LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF | ||||||
|  |  | ||||||
| >MBP1_SCHPO Res2 NP_593032 P41412 | >MBP1_SCHPO Res2 NP_593032 P41412 | ||||||
| -VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV | -VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV | ||||||
| LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS- | LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS- | ||||||
|  |  | ||||||
| >MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35 | >MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35 | ||||||
| -IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV | -IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV | ||||||
| LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY | LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY | ||||||
|  |  | ||||||
| >MBP1_WALME  - XP_006957051 I4YGC0 | >MBP1_WALME  - XP_006957051 I4YGC0 | ||||||
| -IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI | -IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI | ||||||
| LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY | LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY | ||||||
|   | |||||||
| @@ -1,490 +1,490 @@ | |||||||
| [ | [ | ||||||
|   { "name" : "68476_WALME", |   { "name" : "68476_WALME", | ||||||
|     "RefSeqID" : "XP_006957790", |     "RefSeqID" : "XP_006957790", | ||||||
|     "UniProtID" : "I4YDD8", |     "UniProtID" : "I4YDD8", | ||||||
|     "taxonomyID" : "671144", |     "taxonomyID" : "671144", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ", |              "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ", | ||||||
|              "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER", |              "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER", | ||||||
|              "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG", |              "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG", | ||||||
|              "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE", |              "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE", | ||||||
|              "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA", |              "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA", | ||||||
|              "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD", |              "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD", | ||||||
|              "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE", |              "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE", | ||||||
|              "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE", |              "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE", | ||||||
|              "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"] |              "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "00846_COPCI", |   { "name" : "00846_COPCI", | ||||||
|     "RefSeqID" : "XP_001831299", |     "RefSeqID" : "XP_001831299", | ||||||
|     "UniProtID" : "A8N8X1", |     "UniProtID" : "A8N8X1", | ||||||
|     "taxonomyID" : "240176", |     "taxonomyID" : "240176", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG", |              "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG", | ||||||
|              "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS", |              "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS", | ||||||
|              "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH", |              "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH", | ||||||
|              "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP", |              "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP", | ||||||
|              "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF", |              "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF", | ||||||
|              "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK", |              "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK", | ||||||
|              "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN", |              "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN", | ||||||
|              "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT", |              "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT", | ||||||
|              "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS", |              "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS", | ||||||
|              "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE", |              "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE", | ||||||
|              "SEAQVVDIGRVSGFMQKVRDGII"] |              "SEAQVVDIGRVSGFMQKVRDGII"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "8533_BIPOR", |   { "name" : "8533_BIPOR", | ||||||
|     "RefSeqID" : "XP_007691662", |     "RefSeqID" : "XP_007691662", | ||||||
|     "UniProtID" : "W6ZE71", |     "UniProtID" : "W6ZE71", | ||||||
|     "taxonomyID" : "930090", |     "taxonomyID" : "930090", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR", |              "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR", | ||||||
|              "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS", |              "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS", | ||||||
|              "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT", |              "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT", | ||||||
|              "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT", |              "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT", | ||||||
|              "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT", |              "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT", | ||||||
|              "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA", |              "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA", | ||||||
|              "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME", |              "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME", | ||||||
|              "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID", |              "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID", | ||||||
|              "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE", |              "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE", | ||||||
|              "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML", |              "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML", | ||||||
|              "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"] |              "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "PGTG_02039", |   { "name" : "PGTG_02039", | ||||||
|     "RefSeqID" : "XP_003320997", |     "RefSeqID" : "XP_003320997", | ||||||
|     "UniProtID" : "E3JX03", |     "UniProtID" : "E3JX03", | ||||||
|     "taxonomyID" : "418459", |     "taxonomyID" : "418459", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV", |              "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV", | ||||||
|              "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS", |              "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS", | ||||||
|              "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP", |              "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP", | ||||||
|              "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP", |              "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP", | ||||||
|              "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL", |              "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL", | ||||||
|              "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH", |              "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH", | ||||||
|              "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI", |              "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI", | ||||||
|              "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN", |              "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN", | ||||||
|              "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC", |              "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC", | ||||||
|              "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK", |              "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK", | ||||||
|              "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"] |              "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBPA_ASPNI", |   { "name" : "MBPA_ASPNI", | ||||||
|     "RefSeqID" : "XP_664319", |     "RefSeqID" : "XP_664319", | ||||||
|     "UniProtID" : "Q5AYB5", |     "UniProtID" : "Q5AYB5", | ||||||
|     "taxonomyID" : "227321", |     "taxonomyID" : "227321", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG", |              "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG", | ||||||
|              "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK", |              "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK", | ||||||
|              "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP", |              "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP", | ||||||
|              "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG", |              "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG", | ||||||
|              "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER", |              "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER", | ||||||
|              "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK", |              "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK", | ||||||
|              "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS", |              "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS", | ||||||
|              "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA", |              "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA", | ||||||
|              "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS", |              "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS", | ||||||
|              "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP", |              "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP", | ||||||
|              "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL", |              "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL", | ||||||
|              "MRDRGQDW"] |              "MRDRGQDW"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "05520_CRYNE", |   { "name" : "05520_CRYNE", | ||||||
|     "RefSeqID" : "XP_570545", |     "RefSeqID" : "XP_570545", | ||||||
|     "UniProtID" : "Q5KHS0", |     "UniProtID" : "Q5KHS0", | ||||||
|     "taxonomyID" : "214684", |     "taxonomyID" : "214684", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH", |              "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH", | ||||||
|              "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART", |              "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART", | ||||||
|              "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL", |              "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL", | ||||||
|              "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM", |              "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM", | ||||||
|              "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL", |              "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL", | ||||||
|              "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS", |              "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS", | ||||||
|              "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG", |              "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG", | ||||||
|              "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD", |              "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD", | ||||||
|              "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT", |              "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT", | ||||||
|              "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP", |              "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP", | ||||||
|              "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS", |              "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS", | ||||||
|              "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"] |              "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "RES1_SCHPO", |   { "name" : "RES1_SCHPO", | ||||||
|     "RefSeqID" : "NP_595496", |     "RefSeqID" : "NP_595496", | ||||||
|     "UniProtID" : "P33520", |     "UniProtID" : "P33520", | ||||||
|     "taxonomyID" : "284812", |     "taxonomyID" : "284812", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP", |              "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP", | ||||||
|              "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS", |              "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS", | ||||||
|              "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA", |              "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA", | ||||||
|              "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS", |              "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS", | ||||||
|              "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS", |              "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS", | ||||||
|              "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL", |              "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL", | ||||||
|              "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA", |              "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA", | ||||||
|              "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"] |              "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "CDC10_SCHPO", |   { "name" : "CDC10_SCHPO", | ||||||
|     "RefSeqID" : "NP_596132", |     "RefSeqID" : "NP_596132", | ||||||
|     "UniProtID" : "P01129", |     "UniProtID" : "P01129", | ||||||
|     "taxonomyID" : "284812", |     "taxonomyID" : "284812", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL", |              "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL", | ||||||
|              "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP", |              "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP", | ||||||
|              "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ", |              "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ", | ||||||
|              "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS", |              "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS", | ||||||
|              "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV", |              "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV", | ||||||
|              "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ", |              "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ", | ||||||
|              "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR", |              "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR", | ||||||
|              "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ", |              "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ", | ||||||
|              "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS", |              "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS", | ||||||
|              "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"] |              "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "05338_USTMA", |   { "name" : "05338_USTMA", | ||||||
|     "RefSeqID" : "XP_011392041", |     "RefSeqID" : "XP_011392041", | ||||||
|     "UniProtID" : "A0A0D1BWD8", |     "UniProtID" : "A0A0D1BWD8", | ||||||
|     "taxonomyID" : "237631", |     "taxonomyID" : "237631", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT", |              "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT", | ||||||
|              "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI", |              "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI", | ||||||
|              "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR", |              "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR", | ||||||
|              "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL", |              "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL", | ||||||
|              "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA", |              "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA", | ||||||
|              "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI", |              "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI", | ||||||
|              "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA", |              "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA", | ||||||
|              "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT", |              "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT", | ||||||
|              "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL", |              "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL", | ||||||
|              "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL", |              "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL", | ||||||
|              "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN", |              "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN", | ||||||
|              "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG", |              "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG", | ||||||
|              "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP", |              "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP", | ||||||
|              "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"] |              "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "SWI4_SACCE", |   { "name" : "SWI4_SACCE", | ||||||
|     "RefSeqID" : "NP_011036", |     "RefSeqID" : "NP_011036", | ||||||
|     "UniProtID" : "P25302", |     "UniProtID" : "P25302", | ||||||
|     "taxonomyID" : "559292", |     "taxonomyID" : "559292", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF", |              "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF", | ||||||
|              "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP", |              "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP", | ||||||
|              "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN", |              "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN", | ||||||
|              "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS", |              "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS", | ||||||
|              "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK", |              "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK", | ||||||
|              "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY", |              "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY", | ||||||
|              "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK", |              "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK", | ||||||
|              "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN", |              "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN", | ||||||
|              "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ", |              "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ", | ||||||
|              "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS", |              "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS", | ||||||
|              "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV", |              "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV", | ||||||
|              "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL", |              "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL", | ||||||
|              "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL", |              "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL", | ||||||
|              "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"] |              "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "SWI6_NEUCR", |   { "name" : "SWI6_NEUCR", | ||||||
|     "RefSeqID" : "XP_962967", |     "RefSeqID" : "XP_962967", | ||||||
|     "UniProtID" : "Q7SBG9", |     "UniProtID" : "Q7SBG9", | ||||||
|     "taxonomyID" : "367110", |     "taxonomyID" : "367110", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN", |              "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN", | ||||||
|              "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT", |              "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT", | ||||||
|              "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF", |              "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF", | ||||||
|              "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT", |              "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT", | ||||||
|              "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA", |              "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA", | ||||||
|              "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV", |              "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV", | ||||||
|              "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG", |              "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG", | ||||||
|              "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL", |              "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL", | ||||||
|              "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT", |              "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT", | ||||||
|              "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES", |              "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES", | ||||||
|              "EKPELEIARVRRFLGGVEGVVH"] |              "EKPELEIARVRRFLGGVEGVVH"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "15042_USTMA", |   { "name" : "15042_USTMA", | ||||||
|     "RefSeqID" : "XP_011388143", |     "RefSeqID" : "XP_011388143", | ||||||
|     "UniProtID" : "A0A0D1CVS5", |     "UniProtID" : "A0A0D1CVS5", | ||||||
|     "taxonomyID" : "237631", |     "taxonomyID" : "237631", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA", |              "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA", | ||||||
|              "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN", |              "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN", | ||||||
|              "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE", |              "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE", | ||||||
|              "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI", |              "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI", | ||||||
|              "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS", |              "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS", | ||||||
|              "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT", |              "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT", | ||||||
|              "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR", |              "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR", | ||||||
|              "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"] |              "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "04778_USTMA", |   { "name" : "04778_USTMA", | ||||||
|     "RefSeqID" : "XP_011391646", |     "RefSeqID" : "XP_011391646", | ||||||
|     "UniProtID" : "A0A0D1DQM4", |     "UniProtID" : "A0A0D1DQM4", | ||||||
|     "taxonomyID" : "237631", |     "taxonomyID" : "237631", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK", |              "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK", | ||||||
|              "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS", |              "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS", | ||||||
|              "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG", |              "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG", | ||||||
|              "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG", |              "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG", | ||||||
|              "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD", |              "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD", | ||||||
|              "PESAQLFTIHDFGSDPFYAEQVERG"] |              "PESAQLFTIHDFGSDPFYAEQVERG"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "STUA_ASPNI", |   { "name" : "STUA_ASPNI", | ||||||
|     "RefSeqID" : "XP_663440", |     "RefSeqID" : "XP_663440", | ||||||
|     "UniProtID" : "P36011", |     "UniProtID" : "P36011", | ||||||
|     "taxonomyID" : "227321", |     "taxonomyID" : "227321", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS", |              "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS", | ||||||
|              "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM", |              "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM", | ||||||
|              "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ", |              "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ", | ||||||
|              "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL", |              "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL", | ||||||
|              "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR", |              "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR", | ||||||
|              "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP", |              "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP", | ||||||
|              "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD", |              "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD", | ||||||
|              "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"] |              "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "STUA_NEUCR", |   { "name" : "STUA_NEUCR", | ||||||
|     "RefSeqID" : "XP_960837", |     "RefSeqID" : "XP_960837", | ||||||
|     "UniProtID" : "Q1K6U0", |     "UniProtID" : "Q1K6U0", | ||||||
|     "taxonomyID" : "367110", |     "taxonomyID" : "367110", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG", |              "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG", | ||||||
|              "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT", |              "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT", | ||||||
|              "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK", |              "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK", | ||||||
|              "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG", |              "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG", | ||||||
|              "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ", |              "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ", | ||||||
|              "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV", |              "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV", | ||||||
|              "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM", |              "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM", | ||||||
|              "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH", |              "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH", | ||||||
|              "RRR"] |              "RRR"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "PHD1_SACCE", |   { "name" : "PHD1_SACCE", | ||||||
|     "RefSeqID" : "NP_012881", |     "RefSeqID" : "NP_012881", | ||||||
|     "UniProtID" : "P36093", |     "UniProtID" : "P36093", | ||||||
|     "taxonomyID" : "559292", |     "taxonomyID" : "559292", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA", |              "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA", | ||||||
|              "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS", |              "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS", | ||||||
|              "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS", |              "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS", | ||||||
|              "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI", |              "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI", | ||||||
|              "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"] |              "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "08099_COPCI", |   { "name" : "08099_COPCI", | ||||||
|     "RefSeqID" : "XP_001836714", |     "RefSeqID" : "XP_001836714", | ||||||
|     "UniProtID" : "A8NVH3", |     "UniProtID" : "A8NVH3", | ||||||
|     "taxonomyID" : "240176", |     "taxonomyID" : "240176", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS", |              "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS", | ||||||
|              "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS", |              "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS", | ||||||
|              "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS", |              "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS", | ||||||
|              "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV", |              "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV", | ||||||
|              "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ", |              "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ", | ||||||
|              "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA", |              "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA", | ||||||
|              "PHRPW"] |              "PHRPW"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "68479_WALME", |   { "name" : "68479_WALME", | ||||||
|     "RefSeqID" : "XP_006957792", |     "RefSeqID" : "XP_006957792", | ||||||
|     "UniProtID" : "I4YDE0", |     "UniProtID" : "I4YDE0", | ||||||
|     "taxonomyID" : "671144", |     "taxonomyID" : "671144", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF", |              "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF", | ||||||
|              "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS", |              "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS", | ||||||
|              "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS", |              "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS", | ||||||
|              "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"] |              "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "11943_PUCGR", |   { "name" : "11943_PUCGR", | ||||||
|     "RefSeqID" : "XP_003330006", |     "RefSeqID" : "XP_003330006", | ||||||
|     "UniProtID" : "E3KMR2", |     "UniProtID" : "E3KMR2", | ||||||
|     "taxonomyID" : "418459", |     "taxonomyID" : "418459", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV", |              "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV", | ||||||
|              "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP", |              "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP", | ||||||
|              "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA", |              "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA", | ||||||
|              "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS", |              "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS", | ||||||
|              "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN", |              "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN", | ||||||
|              "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA", |              "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA", | ||||||
|              "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL", |              "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL", | ||||||
|              "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"] |              "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "03082_PUCGR", |   { "name" : "03082_PUCGR", | ||||||
|     "RefSeqID" : "XP_003321545", |     "RefSeqID" : "XP_003321545", | ||||||
|     "UniProtID" : "E3JYK1", |     "UniProtID" : "E3JYK1", | ||||||
|     "taxonomyID" : "418459", |     "taxonomyID" : "418459", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV", |              "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV", | ||||||
|              "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP", |              "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP", | ||||||
|              "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL", |              "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL", | ||||||
|              "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK", |              "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK", | ||||||
|              "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP", |              "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP", | ||||||
|              "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA", |              "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA", | ||||||
|              "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD", |              "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD", | ||||||
|              "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR", |              "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR", | ||||||
|              "LIMEWNPSC"] |              "LIMEWNPSC"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "SOK2_SACCE", |   { "name" : "SOK2_SACCE", | ||||||
|     "RefSeqID" : "NP_013729", |     "RefSeqID" : "NP_013729", | ||||||
|     "UniProtID" : "P53438", |     "UniProtID" : "P53438", | ||||||
|     "taxonomyID" : "559292", |     "taxonomyID" : "559292", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS", |              "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS", | ||||||
|              "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS", |              "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS", | ||||||
|              "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP", |              "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP", | ||||||
|              "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY", |              "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY", | ||||||
|              "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG", |              "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG", | ||||||
|              "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM", |              "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM", | ||||||
|              "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS", |              "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS", | ||||||
|              "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA", |              "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA", | ||||||
|              "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ", |              "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ", | ||||||
|              "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"] |              "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "14426_COPCI", |   { "name" : "14426_COPCI", | ||||||
|     "RefSeqID" : "XP_002911429", |     "RefSeqID" : "XP_002911429", | ||||||
|     "UniProtID" : "D6RMB0", |     "UniProtID" : "D6RMB0", | ||||||
|     "taxonomyID" : "240176", |     "taxonomyID" : "240176", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA", |              "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA", | ||||||
|              "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP", |              "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP", | ||||||
|              "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT", |              "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT", | ||||||
|              "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK", |              "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK", | ||||||
|              "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA", |              "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA", | ||||||
|              "ITYLPNFL"] |              "ITYLPNFL"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "BQT4_SCHPO", |   { "name" : "BQT4_SCHPO", | ||||||
|     "RefSeqID" : "NP_596166", |     "RefSeqID" : "NP_596166", | ||||||
|     "UniProtID" : "O60158", |     "UniProtID" : "O60158", | ||||||
|     "taxonomyID" : "284812", |     "taxonomyID" : "284812", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA", |              "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA", | ||||||
|              "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG", |              "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG", | ||||||
|              "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS", |              "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS", | ||||||
|              "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN", |              "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN", | ||||||
|              "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK", |              "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK", | ||||||
|              "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"] |              "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "PGTG_05590", |   { "name" : "PGTG_05590", | ||||||
|     "RefSeqID" : "XP_003323688", |     "RefSeqID" : "XP_003323688", | ||||||
|     "UniProtID" : "E3K4V4", |     "UniProtID" : "E3K4V4", | ||||||
|     "taxonomyID" : "418459", |     "taxonomyID" : "418459", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT", |              "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT", | ||||||
|              "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS", |              "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS", | ||||||
|              "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS", |              "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS", | ||||||
|              "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND", |              "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND", | ||||||
|              "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN", |              "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN", | ||||||
|              "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG", |              "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG", | ||||||
|              "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"] |              "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "06560_NEUCR", |   { "name" : "06560_NEUCR", | ||||||
|     "RefSeqID" : "XP_962267", |     "RefSeqID" : "XP_962267", | ||||||
|     "UniProtID" : "Q7S9H5", |     "UniProtID" : "Q7S9H5", | ||||||
|     "taxonomyID" : "367110", |     "taxonomyID" : "367110", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP", |              "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP", | ||||||
|              "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP", |              "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP", | ||||||
|              "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT", |              "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT", | ||||||
|              "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV", |              "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV", | ||||||
|              "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE", |              "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE", | ||||||
|              "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV", |              "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV", | ||||||
|              "ANVL"] |              "ANVL"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "81480_BIPOR", |   { "name" : "81480_BIPOR", | ||||||
|     "RefSeqID" : "XP_007682909", |     "RefSeqID" : "XP_007682909", | ||||||
|     "UniProtID" : "W6ZKJ4", |     "UniProtID" : "W6ZKJ4", | ||||||
|     "taxonomyID" : "930090", |     "taxonomyID" : "930090", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN", |              "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN", | ||||||
|              "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI", |              "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI", | ||||||
|              "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP", |              "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP", | ||||||
|              "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE", |              "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE", | ||||||
|              "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK", |              "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK", | ||||||
|              "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"] |              "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "01622_ASPNI", |   { "name" : "01622_ASPNI", | ||||||
|     "RefSeqID" : "XP_657766", |     "RefSeqID" : "XP_657766", | ||||||
|     "UniProtID" : "Q5BH18", |     "UniProtID" : "Q5BH18", | ||||||
|     "taxonomyID" : "227321", |     "taxonomyID" : "227321", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP", |              "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP", | ||||||
|              "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA", |              "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA", | ||||||
|              "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL", |              "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL", | ||||||
|              "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV", |              "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV", | ||||||
|              "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE", |              "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE", | ||||||
|              "RVRNRALMGVTAAFALAKPALVLLEA"] |              "RVRNRALMGVTAAFALAKPALVLLEA"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "05405_ASPNI", |   { "name" : "05405_ASPNI", | ||||||
|     "RefSeqID" : "XP_663009", |     "RefSeqID" : "XP_663009", | ||||||
|     "UniProtID" : "Q5B225", |     "UniProtID" : "Q5B225", | ||||||
|     "taxonomyID" : "227321", |     "taxonomyID" : "227321", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD", |              "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD", | ||||||
|              "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM", |              "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM", | ||||||
|              "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA", |              "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA", | ||||||
|              "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS", |              "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS", | ||||||
|              "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP", |              "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP", | ||||||
|              "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR", |              "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR", | ||||||
|              "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD", |              "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD", | ||||||
|              "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"] |              "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "105954_BIPOR", |   { "name" : "105954_BIPOR", | ||||||
|     "RefSeqID" : "XP_007691967", |     "RefSeqID" : "XP_007691967", | ||||||
|     "UniProtID" : "W6Z1H5", |     "UniProtID" : "W6Z1H5", | ||||||
|     "taxonomyID" : "930090", |     "taxonomyID" : "930090", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI", |              "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI", | ||||||
|              "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR", |              "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR", | ||||||
|              "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE", |              "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE", | ||||||
|              "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP", |              "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP", | ||||||
|              "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV", |              "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV", | ||||||
|              "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"] |              "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "69819_WALME", |   { "name" : "69819_WALME", | ||||||
|     "RefSeqID" : "XP_006959479", |     "RefSeqID" : "XP_006959479", | ||||||
|     "UniProtID" : "I4Y911", |     "UniProtID" : "I4Y911", | ||||||
|     "taxonomyID" : "671144", |     "taxonomyID" : "671144", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI", |              "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI", | ||||||
|              "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS", |              "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS", | ||||||
|              "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL", |              "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL", | ||||||
|              "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ", |              "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ", | ||||||
|              "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"] |              "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "02840_CRYNE", |   { "name" : "02840_CRYNE", | ||||||
|     "RefSeqID" : "XP_568872", |     "RefSeqID" : "XP_568872", | ||||||
|     "UniProtID" : "Q5KM59", |     "UniProtID" : "Q5KM59", | ||||||
|     "taxonomyID" : "214684", |     "taxonomyID" : "214684", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG", |              "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG", | ||||||
|              "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV", |              "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV", | ||||||
|              "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT", |              "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT", | ||||||
|              "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA", |              "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA", | ||||||
|              "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"] |              "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "11055_USTMA", |   { "name" : "11055_USTMA", | ||||||
|     "RefSeqID" : "XP_011390537", |     "RefSeqID" : "XP_011390537", | ||||||
|     "UniProtID" : "A0A0D1DZM8", |     "UniProtID" : "A0A0D1DZM8", | ||||||
|     "taxonomyID" : "237631", |     "taxonomyID" : "237631", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK", |              "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK", | ||||||
|              "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF", |              "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF", | ||||||
|              "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV", |              "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV", | ||||||
|              "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR", |              "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR", | ||||||
|              "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV", |              "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV", | ||||||
|              "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL", |              "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL", | ||||||
|              "ASILPW"] |              "ASILPW"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "XBP1_NEUCR", |   { "name" : "XBP1_NEUCR", | ||||||
|     "RefSeqID" : "XP_962373", |     "RefSeqID" : "XP_962373", | ||||||
|     "UniProtID" : "Q7S9W7", |     "UniProtID" : "Q7S9W7", | ||||||
|     "taxonomyID" : "367110", |     "taxonomyID" : "367110", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT", |              "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT", | ||||||
|              "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR", |              "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR", | ||||||
|              "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH", |              "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH", | ||||||
|              "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP", |              "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP", | ||||||
|              "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ", |              "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ", | ||||||
|              "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT", |              "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT", | ||||||
|              "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT", |              "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT", | ||||||
|              "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD", |              "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD", | ||||||
|              "DEGDYEHEQQYRRKRRRLLLVGRAKSF"] |              "DEGDYEHEQQYRRKRRRLLLVGRAKSF"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "XBP1_SACCE", |   { "name" : "XBP1_SACCE", | ||||||
|     "RefSeqID" : "NP_012165", |     "RefSeqID" : "NP_012165", | ||||||
|     "UniProtID" : "P40489", |     "UniProtID" : "P40489", | ||||||
|     "taxonomyID" : "559292", |     "taxonomyID" : "559292", | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|              "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA", |              "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA", | ||||||
|              "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR", |              "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR", | ||||||
|              "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE", |              "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE", | ||||||
|              "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS", |              "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS", | ||||||
|              "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ", |              "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ", | ||||||
|              "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND", |              "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND", | ||||||
|              "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY", |              "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY", | ||||||
|              "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK", |              "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK", | ||||||
|              "FKTNSKQ"] |              "FKTNSKQ"] | ||||||
|   } |   } | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -1,116 +1,116 @@ | |||||||
| [ | [ | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, |   {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"}, |   {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"}, |   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"}, |   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"}, |   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"}, |   {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"}, |   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"}, |   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"}, |   {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"}, |   {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"}, | ||||||
|   {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"}, |   {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"}, |   {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"}, | ||||||
|   {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"}, |   {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"}, | ||||||
|   {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"}, |   {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"}, | ||||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"}, |   {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"}, | ||||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"}, |   {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"}, | ||||||
|   {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"}, |   {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"}, | ||||||
|   {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"}, |   {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"}, |   {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"}, |   {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"}, |   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"}, |   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"}, |   {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"}, |   {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"}, |   {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"}, |   {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"}, |   {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"}, | ||||||
|   {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"}, |   {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"}, |   {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"}, |   {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"}, |   {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"}, |   {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"}, |   {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"}, |   {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"}, | ||||||
|   {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"}, |   {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"}, |   {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"}, |   {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"}, |   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"}, |   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"}, |   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"}, |   {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"}, |   {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"}, |   {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"}, |   {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"}, | ||||||
|   {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"}, |   {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"}, |   {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"}, |   {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"}, |   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"}, |   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"}, |   {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"}, |   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"}, |   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"}, |   {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"}, |   {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"}, |   {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"}, | ||||||
|   {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"}, |   {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"}, |   {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"}, | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"}, |   {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"}, | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"}, |   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"}, | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"}, |   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"}, | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"}, |   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"}, | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"}, |   {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"}, | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"}, |   {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"}, | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"}, |   {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"}, | ||||||
|   {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"}, |   {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"}, |   {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"}, | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"}, |   {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"}, | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"}, |   {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"}, | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"}, |   {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"}, | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"}, |   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"}, | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"}, |   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"}, | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"}, |   {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"}, | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"}, |   {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"}, | ||||||
|   {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"}, |   {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"}, |   {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"}, |   {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"}, |   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"}, |   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"}, |   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"}, |   {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"}, |   {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"}, |   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"}, |   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"}, |   {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"}, |   {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"}, | ||||||
|   {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"}, |   {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"}, | ||||||
|  |  | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"}, |   {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"}, |   {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"}, |   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"}, |   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"}, |   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"}, |   {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"}, |   {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"}, |   {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"}, |   {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"}, | ||||||
|   {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"} |   {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"} | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -1,47 +1,47 @@ | |||||||
| [ | [ | ||||||
|   { "name" : "APSES fold", |   { "name" : "APSES fold", | ||||||
|     "description " : "DNA binding domain by similarity to structure", |     "description " : "DNA binding domain by similarity to structure", | ||||||
|     "sourceDB" : "PDB", |     "sourceDB" : "PDB", | ||||||
|     "accession" : "1BM8_A_1_99"}, |     "accession" : "1BM8_A_1_99"}, | ||||||
|  |  | ||||||
|   { "name" : "KilA-N", |   { "name" : "KilA-N", | ||||||
|     "description " : "DNA binding domain by Pfam annotation", |     "description " : "DNA binding domain by Pfam annotation", | ||||||
|     "sourceDB" : "Pfam", |     "sourceDB" : "Pfam", | ||||||
|     "accession" : "PF04383"}, |     "accession" : "PF04383"}, | ||||||
|  |  | ||||||
|   { "name" : "AT hook", |   { "name" : "AT hook", | ||||||
|     "description " : "DNA interaction motif by SMART annotation", |     "description " : "DNA interaction motif by SMART annotation", | ||||||
|     "sourceDB" : "SMART", |     "sourceDB" : "SMART", | ||||||
|     "accession" : null}, |     "accession" : null}, | ||||||
|  |  | ||||||
|   { "name" : "low complexity", |   { "name" : "low complexity", | ||||||
|     "description " : "SEG annotation by SMART", |     "description " : "SEG annotation by SMART", | ||||||
|     "sourceDB" : "SMART", |     "sourceDB" : "SMART", | ||||||
|     "accession" : null}, |     "accession" : null}, | ||||||
|  |  | ||||||
|   { "name" : "Ankyrin fold", |   { "name" : "Ankyrin fold", | ||||||
|     "description " : "Ankyrin domain by SMART annotation", |     "description " : "Ankyrin domain by SMART annotation", | ||||||
|     "sourceDB" : "SMART", |     "sourceDB" : "SMART", | ||||||
|     "accession" : "SM00248"}, |     "accession" : "SM00248"}, | ||||||
|  |  | ||||||
|   { "name" : "Swi6 fold", |   { "name" : "Swi6 fold", | ||||||
|     "description " : "Swi6 fold by similarity to structure", |     "description " : "Swi6 fold by similarity to structure", | ||||||
|     "sourceDB" : "PDB", |     "sourceDB" : "PDB", | ||||||
|     "accession" : "1SW6_B"}, |     "accession" : "1SW6_B"}, | ||||||
|  |  | ||||||
|   { "name" : "coiled coil", |   { "name" : "coiled coil", | ||||||
|     "description " : "Coiled coil by SMART annotation", |     "description " : "Coiled coil by SMART annotation", | ||||||
|     "sourceDB" : "SMART", |     "sourceDB" : "SMART", | ||||||
|     "accession" : null}, |     "accession" : null}, | ||||||
|  |  | ||||||
|   { "name" : "McInerny 2011", |   { "name" : "McInerny 2011", | ||||||
|     "description " : "Yeast cell cycle review", |     "description " : "Yeast cell cycle review", | ||||||
|     "sourceDB" : "PubMed", |     "sourceDB" : "PubMed", | ||||||
|     "accession" : "21310294"} |     "accession" : "21310294"} | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,155 +1,155 @@ | |||||||
| [ | [ | ||||||
|   { "name" : "MBP1_SCHPO", |   { "name" : "MBP1_SCHPO", | ||||||
|     "RefSeqID" : "NP_593032", |     "RefSeqID" : "NP_593032", | ||||||
|     "UniProtID" : "P41412", |     "UniProtID" : "P41412", | ||||||
|     "taxonomyID" : 284812, |     "taxonomyID" : 284812, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ", |        "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ", | ||||||
|        "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS", |        "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS", | ||||||
|        "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK", |        "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK", | ||||||
|        "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL", |        "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL", | ||||||
|        "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI", |        "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI", | ||||||
|        "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL", |        "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL", | ||||||
|        "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL", |        "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL", | ||||||
|        "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS", |        "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS", | ||||||
|        "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI", |        "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI", | ||||||
|        "AMSCGINPEDLSLEILDAVEEALTREK"] |        "AMSCGINPEDLSLEILDAVEEALTREK"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBP1_ASPNI", |   { "name" : "MBP1_ASPNI", | ||||||
|     "RefSeqID" : "XP_660758", |     "RefSeqID" : "XP_660758", | ||||||
|     "UniProtID" : "Q5B8H6", |     "UniProtID" : "Q5B8H6", | ||||||
|     "taxonomyID" : 227321, |     "taxonomyID" : 227321, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV", |        "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV", | ||||||
|        "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV", |        "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV", | ||||||
|        "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA", |        "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA", | ||||||
|        "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL", |        "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL", | ||||||
|        "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS", |        "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS", | ||||||
|        "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS", |        "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS", | ||||||
|        "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL", |        "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL", | ||||||
|        "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE", |        "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE", | ||||||
|        "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF", |        "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF", | ||||||
|        "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"] |        "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBP1_BIPOR", |   { "name" : "MBP1_BIPOR", | ||||||
|     "RefSeqID" : "XP_007682304", |     "RefSeqID" : "XP_007682304", | ||||||
|     "UniProtID" : "W6ZM86", |     "UniProtID" : "W6ZM86", | ||||||
|     "taxonomyID" : 930090, |     "taxonomyID" : 930090, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV", |        "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV", | ||||||
|        "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA", |        "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA", | ||||||
|        "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG", |        "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG", | ||||||
|        "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV", |        "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV", | ||||||
|        "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS", |        "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS", | ||||||
|        "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK", |        "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK", | ||||||
|        "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA", |        "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA", | ||||||
|        "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND", |        "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND", | ||||||
|        "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV", |        "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV", | ||||||
|        "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"] |        "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBP1_NEUCR", |   { "name" : "MBP1_NEUCR", | ||||||
|     "RefSeqID" : "XP_955821", |     "RefSeqID" : "XP_955821", | ||||||
|     "UniProtID" : "Q7RW59", |     "UniProtID" : "Q7RW59", | ||||||
|     "taxonomyID" : 367110, |     "taxonomyID" : 367110, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV", |        "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV", | ||||||
|        "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV", |        "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV", | ||||||
|        "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS", |        "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS", | ||||||
|        "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG", |        "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG", | ||||||
|        "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK", |        "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK", | ||||||
|        "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP", |        "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP", | ||||||
|        "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV", |        "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV", | ||||||
|        "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE", |        "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE", | ||||||
|        "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE", |        "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE", | ||||||
|        "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA", |        "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA", | ||||||
|        "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG", |        "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG", | ||||||
|        "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"] |        "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBP1_COPCI", |   { "name" : "MBP1_COPCI", | ||||||
|     "RefSeqID" : "XP_001837394", |     "RefSeqID" : "XP_001837394", | ||||||
|     "UniProtID" : "A8NYC6", |     "UniProtID" : "A8NYC6", | ||||||
|     "taxonomyID" : 240176, |     "taxonomyID" : 240176, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG", |        "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG", | ||||||
|        "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN", |        "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN", | ||||||
|        "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY", |        "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY", | ||||||
|        "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA", |        "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA", | ||||||
|        "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD", |        "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD", | ||||||
|        "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN", |        "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN", | ||||||
|        "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST", |        "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST", | ||||||
|        "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE", |        "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE", | ||||||
|        "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ", |        "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ", | ||||||
|        "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA", |        "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA", | ||||||
|        "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"] |        "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBP1_CRYNE", |   { "name" : "MBP1_CRYNE", | ||||||
|     "RefSeqID" : "XP_569090", |     "RefSeqID" : "XP_569090", | ||||||
|     "UniProtID" : "Q5KMQ9", |     "UniProtID" : "Q5KMQ9", | ||||||
|     "taxonomyID" : 214684, |     "taxonomyID" : 214684, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV", |        "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV", | ||||||
|        "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE", |        "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE", | ||||||
|        "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM", |        "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM", | ||||||
|        "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG", |        "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG", | ||||||
|        "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT", |        "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT", | ||||||
|        "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK", |        "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK", | ||||||
|        "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE", |        "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE", | ||||||
|        "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE", |        "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE", | ||||||
|        "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR", |        "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR", | ||||||
|        "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ", |        "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ", | ||||||
|        "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"] |        "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBP1_PUCGR", |   { "name" : "MBP1_PUCGR", | ||||||
|     "RefSeqID" : "XP_003327086", |     "RefSeqID" : "XP_003327086", | ||||||
|     "UniProtID" : "E3KED4", |     "UniProtID" : "E3KED4", | ||||||
|     "taxonomyID" : 418459, |     "taxonomyID" : 418459, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY", |        "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY", | ||||||
|        "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE", |        "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE", | ||||||
|        "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK", |        "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK", | ||||||
|        "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN", |        "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN", | ||||||
|        "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS", |        "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS", | ||||||
|        "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS", |        "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS", | ||||||
|        "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH", |        "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH", | ||||||
|        "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE", |        "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE", | ||||||
|        "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG", |        "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG", | ||||||
|        "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD", |        "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD", | ||||||
|        "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR", |        "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR", | ||||||
|        "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE", |        "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE", | ||||||
|        "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL", |        "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL", | ||||||
|        "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"] |        "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBP1_USTMA", |   { "name" : "MBP1_USTMA", | ||||||
|     "RefSeqID" : "XP_011392621", |     "RefSeqID" : "XP_011392621", | ||||||
|     "UniProtID" : "A0A0D1DP35", |     "UniProtID" : "A0A0D1DP35", | ||||||
|     "taxonomyID" : 237631, |     "taxonomyID" : 237631, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG", |        "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG", | ||||||
|        "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS", |        "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS", | ||||||
|        "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY", |        "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY", | ||||||
|        "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ", |        "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ", | ||||||
|        "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY", |        "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY", | ||||||
|        "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP", |        "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP", | ||||||
|        "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE", |        "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE", | ||||||
|        "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS", |        "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS", | ||||||
|        "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM", |        "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM", | ||||||
|        "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA", |        "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA", | ||||||
|        "P"] |        "P"] | ||||||
|   }, |   }, | ||||||
|   { "name" : "MBP1_WALME", |   { "name" : "MBP1_WALME", | ||||||
|     "RefSeqID" : "XP_006957051", |     "RefSeqID" : "XP_006957051", | ||||||
|     "UniProtID" : "I4YGC0", |     "UniProtID" : "I4YGC0", | ||||||
|     "taxonomyID" : 671144, |     "taxonomyID" : 671144, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG", |        "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG", | ||||||
|        "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK", |        "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK", | ||||||
|        "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID", |        "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID", | ||||||
|        "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV", |        "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV", | ||||||
|        "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS", |        "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS", | ||||||
|        "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP", |        "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP", | ||||||
|        "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR", |        "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR", | ||||||
|        "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE", |        "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE", | ||||||
|        "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA", |        "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA", | ||||||
|        "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"] |        "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"] | ||||||
|   } |   } | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -1,22 +1,22 @@ | |||||||
| [ | [ | ||||||
|   { "ID" : 227321, |   { "ID" : 227321, | ||||||
|     "species" : "Aspergillus nidulans FGSC A4"}, |     "species" : "Aspergillus nidulans FGSC A4"}, | ||||||
|   { "ID" : 930090, |   { "ID" : 930090, | ||||||
|     "species" : "Bipolaris oryzae ATCC 44560"}, |     "species" : "Bipolaris oryzae ATCC 44560"}, | ||||||
|   { "ID" : 240176, |   { "ID" : 240176, | ||||||
|     "species" : "Coprinopsis cinerea okayama7#130"}, |     "species" : "Coprinopsis cinerea okayama7#130"}, | ||||||
|   { "ID" : 214684, |   { "ID" : 214684, | ||||||
|     "species" : "Cryptococcus neoformans var. neoformans JEC21"}, |     "species" : "Cryptococcus neoformans var. neoformans JEC21"}, | ||||||
|   { "ID" : 367110, |   { "ID" : 367110, | ||||||
|     "species" : "Neurospora crassa OR74A"}, |     "species" : "Neurospora crassa OR74A"}, | ||||||
|   { "ID" : 418459, |   { "ID" : 418459, | ||||||
|     "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"}, |     "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"}, | ||||||
|   { "ID" : 559292, |   { "ID" : 559292, | ||||||
|     "species" : "Saccharomyces cerevisiae S288C"}, |     "species" : "Saccharomyces cerevisiae S288C"}, | ||||||
|   { "ID" : 284812, |   { "ID" : 284812, | ||||||
|     "species" : "Schizosaccharomyces pombe 972h-"}, |     "species" : "Schizosaccharomyces pombe 972h-"}, | ||||||
|   { "ID" : 237631, |   { "ID" : 237631, | ||||||
|     "species" : "Ustilago maydis 521"}, |     "species" : "Ustilago maydis 521"}, | ||||||
|   { "ID" : 671144, |   { "ID" : 671144, | ||||||
|     "species" : "Wallemia mellicola CBS 633.66"} |     "species" : "Wallemia mellicola CBS 633.66"} | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -1,115 +1,115 @@ | |||||||
| ID	protein.ID	feature.ID	start	end	note | ID	protein.ID	feature.ID	start	end	note | ||||||
| # MBP1_SACCE | # MBP1_SACCE | ||||||
| NA	ref_pro_4	ref_ftr_1	4	102	APSES fold | NA	ref_pro_4	ref_ftr_1	4	102	APSES fold | ||||||
| NA	ref_pro_4	ref_ftr_2	22	105	KilA-N | NA	ref_pro_4	ref_ftr_2	22	105	KilA-N | ||||||
| NA	ref_pro_4	ref_ftr_4	108	122	low complexity | NA	ref_pro_4	ref_ftr_4	108	122	low complexity | ||||||
| NA	ref_pro_4	ref_ftr_4	236	241	low complexity | NA	ref_pro_4	ref_ftr_4	236	241	low complexity | ||||||
| NA	ref_pro_4	ref_ftr_4	279	307	low complexity | NA	ref_pro_4	ref_ftr_4	279	307	low complexity | ||||||
| NA	ref_pro_4	ref_ftr_4	700	717	low complexity | NA	ref_pro_4	ref_ftr_4	700	717	low complexity | ||||||
| NA	ref_pro_4	ref_ftr_4	700	717	low complexity | NA	ref_pro_4	ref_ftr_4	700	717	low complexity | ||||||
| NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin | NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin | ||||||
| NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin | NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin | ||||||
| NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin | NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin | ||||||
| NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold | NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold | ||||||
| NA	ref_pro_4	ref_ftr_7	633	655	coiled coil | NA	ref_pro_4	ref_ftr_7	633	655	coiled coil | ||||||
| # MBP1_ASPNI | # MBP1_ASPNI | ||||||
| NA	ref_pro_1	ref_ftr_1	9	106	APSES fold | NA	ref_pro_1	ref_ftr_1	9	106	APSES fold | ||||||
| NA	ref_pro_1	ref_ftr_2	26	109	KilA-N | NA	ref_pro_1	ref_ftr_2	26	109	KilA-N | ||||||
| NA	ref_pro_1	ref_ftr_4	529	534	low complexity | NA	ref_pro_1	ref_ftr_4	529	534	low complexity | ||||||
| NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin | NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin | ||||||
| NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin | NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin | ||||||
| NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold | NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold | ||||||
| NA	ref_pro_1	ref_ftr_7	509	572	coiled coil | NA	ref_pro_1	ref_ftr_7	509	572	coiled coil | ||||||
| # MBP1_BIPOR | # MBP1_BIPOR | ||||||
| NA	ref_pro_2	ref_ftr_1	8	106	APSES fold | NA	ref_pro_2	ref_ftr_1	8	106	APSES fold | ||||||
| NA	ref_pro_2	ref_ftr_2	26	109	KilA-N | NA	ref_pro_2	ref_ftr_2	26	109	KilA-N | ||||||
| NA	ref_pro_2	ref_ftr_4	134	152	low complexity | NA	ref_pro_2	ref_ftr_4	134	152	low complexity | ||||||
| NA	ref_pro_2	ref_ftr_4	267	278	low complexity | NA	ref_pro_2	ref_ftr_4	267	278	low complexity | ||||||
| NA	ref_pro_2	ref_ftr_4	670	685	low complexity | NA	ref_pro_2	ref_ftr_4	670	685	low complexity | ||||||
| NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin | NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin | ||||||
| NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin | NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin | ||||||
| NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold | NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold | ||||||
| NA	ref_pro_2	ref_ftr_7	659	681	coiled coil | NA	ref_pro_2	ref_ftr_7	659	681	coiled coil | ||||||
| NA	ref_pro_2	ref_ftr_7	500	590	coiled coil | NA	ref_pro_2	ref_ftr_7	500	590	coiled coil | ||||||
| # MBP1_NEUCR | # MBP1_NEUCR | ||||||
| NA	ref_pro_3	ref_ftr_1	14	114	APSES fold | NA	ref_pro_3	ref_ftr_1	14	114	APSES fold | ||||||
| NA	ref_pro_3	ref_ftr_2	34	117	KilA-N | NA	ref_pro_3	ref_ftr_2	34	117	KilA-N | ||||||
| NA	ref_pro_3	ref_ftr_4	130	141	low complexity | NA	ref_pro_3	ref_ftr_4	130	141	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	253	266	low complexity | NA	ref_pro_3	ref_ftr_4	253	266	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	514	525	low complexity | NA	ref_pro_3	ref_ftr_4	514	525	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	554	564	low complexity | NA	ref_pro_3	ref_ftr_4	554	564	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	601	618	low complexity | NA	ref_pro_3	ref_ftr_4	601	618	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	620	629	low complexity | NA	ref_pro_3	ref_ftr_4	620	629	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	636	652	low complexity | NA	ref_pro_3	ref_ftr_4	636	652	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	658	672	low complexity | NA	ref_pro_3	ref_ftr_4	658	672	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	725	735	low complexity | NA	ref_pro_3	ref_ftr_4	725	735	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_4	752	771	low complexity | NA	ref_pro_3	ref_ftr_4	752	771	low complexity | ||||||
| NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin | NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin | ||||||
| NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin | NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin | ||||||
| NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold | NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold | ||||||
| NA	ref_pro_3	ref_ftr_7	500	550	coiled coil | NA	ref_pro_3	ref_ftr_7	500	550	coiled coil | ||||||
| # MBP1_SCHPO | # MBP1_SCHPO | ||||||
| NA	ref_pro_5	ref_ftr_1	8	104	APSES fold | NA	ref_pro_5	ref_ftr_1	8	104	APSES fold | ||||||
| NA	ref_pro_5	ref_ftr_2	25	113	KilA-N | NA	ref_pro_5	ref_ftr_2	25	113	KilA-N | ||||||
| NA	ref_pro_5	ref_ftr_4	111	125	low complexity | NA	ref_pro_5	ref_ftr_4	111	125	low complexity | ||||||
| NA	ref_pro_5	ref_ftr_4	136	145	low complexity | NA	ref_pro_5	ref_ftr_4	136	145	low complexity | ||||||
| NA	ref_pro_5	ref_ftr_4	176	191	low complexity | NA	ref_pro_5	ref_ftr_4	176	191	low complexity | ||||||
| NA	ref_pro_5	ref_ftr_4	422	447	low complexity | NA	ref_pro_5	ref_ftr_4	422	447	low complexity | ||||||
| NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin | NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin | ||||||
| NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin | NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin | ||||||
| NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold | NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold | ||||||
| NA	ref_pro_5	ref_ftr_7	457	538	coiled coil | NA	ref_pro_5	ref_ftr_7	457	538	coiled coil | ||||||
| # MBP1_COPCI | # MBP1_COPCI | ||||||
| NA	ref_pro_6	ref_ftr_1	5	103	APSES fold | NA	ref_pro_6	ref_ftr_1	5	103	APSES fold | ||||||
| NA	ref_pro_6	ref_ftr_2	23	106	KilA-N | NA	ref_pro_6	ref_ftr_2	23	106	KilA-N | ||||||
| NA	ref_pro_6	ref_ftr_4	170	191	low complexity | NA	ref_pro_6	ref_ftr_4	170	191	low complexity | ||||||
| NA	ref_pro_6	ref_ftr_4	435	450	low complexity | NA	ref_pro_6	ref_ftr_4	435	450	low complexity | ||||||
| NA	ref_pro_6	ref_ftr_4	611	626	low complexity | NA	ref_pro_6	ref_ftr_4	611	626	low complexity | ||||||
| NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin | NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin | ||||||
| NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin | NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin | ||||||
| NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin | NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin | ||||||
| NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold | NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold | ||||||
| NA	ref_pro_6	ref_ftr_7	500	570	coiled coil | NA	ref_pro_6	ref_ftr_7	500	570	coiled coil | ||||||
| NA	ref_pro_6	ref_ftr_7	651	678	coiled coil | NA	ref_pro_6	ref_ftr_7	651	678	coiled coil | ||||||
| # MBP1_CRYNE | # MBP1_CRYNE | ||||||
| NA	ref_pro_7	ref_ftr_1	113	211	APSES fold | NA	ref_pro_7	ref_ftr_1	113	211	APSES fold | ||||||
| NA	ref_pro_7	ref_ftr_2	131	215	KilA-N | NA	ref_pro_7	ref_ftr_2	131	215	KilA-N | ||||||
| NA	ref_pro_7	ref_ftr_4	66	85	low complexity | NA	ref_pro_7	ref_ftr_4	66	85	low complexity | ||||||
| NA	ref_pro_7	ref_ftr_4	413	423	low complexity | NA	ref_pro_7	ref_ftr_4	413	423	low complexity | ||||||
| NA	ref_pro_7	ref_ftr_4	633	644	low complexity | NA	ref_pro_7	ref_ftr_4	633	644	low complexity | ||||||
| NA	ref_pro_7	ref_ftr_4	697	709	low complexity | NA	ref_pro_7	ref_ftr_4	697	709	low complexity | ||||||
| NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin | NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin | ||||||
| NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin | NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin | ||||||
| NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold | NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold | ||||||
| # MBP1_PUCGR | # MBP1_PUCGR | ||||||
| NA	ref_pro_8	ref_ftr_1	90	187	APSES fold | NA	ref_pro_8	ref_ftr_1	90	187	APSES fold | ||||||
| NA	ref_pro_8	ref_ftr_2	107	190	KilA-N | NA	ref_pro_8	ref_ftr_2	107	190	KilA-N | ||||||
| NA	ref_pro_8	ref_ftr_4	208	227	low complexity | NA	ref_pro_8	ref_ftr_4	208	227	low complexity | ||||||
| NA	ref_pro_8	ref_ftr_4	273	291	low complexity | NA	ref_pro_8	ref_ftr_4	273	291	low complexity | ||||||
| NA	ref_pro_8	ref_ftr_5	442	271	Ankyrin | NA	ref_pro_8	ref_ftr_5	442	271	Ankyrin | ||||||
| NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin | NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin | ||||||
| NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin | NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin | ||||||
| NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold | NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold | ||||||
| NA	ref_pro_8	ref_ftr_7	827	863	coiled coil | NA	ref_pro_8	ref_ftr_7	827	863	coiled coil | ||||||
| # MBP1_USTMA | # MBP1_USTMA | ||||||
| NA	ref_pro_9	ref_ftr_1	7	104	APSES fold | NA	ref_pro_9	ref_ftr_1	7	104	APSES fold | ||||||
| NA	ref_pro_9	ref_ftr_2	24	107	KilA-N | NA	ref_pro_9	ref_ftr_2	24	107	KilA-N | ||||||
| NA	ref_pro_9	ref_ftr_4	106	116	low complexity | NA	ref_pro_9	ref_ftr_4	106	116	low complexity | ||||||
| NA	ref_pro_9	ref_ftr_4	161	183	low complexity | NA	ref_pro_9	ref_ftr_4	161	183	low complexity | ||||||
| NA	ref_pro_9	ref_ftr_4	657	672	low complexity | NA	ref_pro_9	ref_ftr_4	657	672	low complexity | ||||||
| NA	ref_pro_9	ref_ftr_4	776	796	low complexity | NA	ref_pro_9	ref_ftr_4	776	796	low complexity | ||||||
| NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin | NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin | ||||||
| NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin | NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin | ||||||
| NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold | NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold | ||||||
| NA	ref_pro_9	ref_ftr_7	581	609	coiled coil | NA	ref_pro_9	ref_ftr_7	581	609	coiled coil | ||||||
| # MBP1_WALME | # MBP1_WALME | ||||||
| NA	ref_pro_10	ref_ftr_1	6	103	APSES fold | NA	ref_pro_10	ref_ftr_1	6	103	APSES fold | ||||||
| NA	ref_pro_10	ref_ftr_2	23	106	KilA-N | NA	ref_pro_10	ref_ftr_2	23	106	KilA-N | ||||||
| NA	ref_pro_10	ref_ftr_4	149	162	low complexity | NA	ref_pro_10	ref_ftr_4	149	162	low complexity | ||||||
| NA	ref_pro_10	ref_ftr_4	171	188	low complexity | NA	ref_pro_10	ref_ftr_4	171	188	low complexity | ||||||
| NA	ref_pro_10	ref_ftr_4	618	628	low complexity | NA	ref_pro_10	ref_ftr_4	618	628	low complexity | ||||||
| NA	ref_pro_10	ref_ftr_4	634	660	low complexity | NA	ref_pro_10	ref_ftr_4	634	660	low complexity | ||||||
| NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin | NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin | ||||||
| NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin | NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin | ||||||
| NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold | NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold | ||||||
| NA	ref_pro_10	ref_ftr_7	461	585	coiled coil | NA	ref_pro_10	ref_ftr_7	461	585	coiled coil | ||||||
|   | |||||||
| @@ -1,37 +1,37 @@ | |||||||
| # functionTemplate.R | # functionTemplate.R | ||||||
| # | # | ||||||
| # Purpose:  (General) | # Purpose:  (General) | ||||||
| # | # | ||||||
| # ToDo: | # ToDo: | ||||||
| # Notes: | # Notes: | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
| myFunction <- function(a, b=1) { | myFunction <- function(a, b=1) { | ||||||
| 	# Purpose: | 	# Purpose: | ||||||
| 	#     Describe ... | 	#     Describe ... | ||||||
|     # Version: |     # Version: | ||||||
|     # Date: |     # Date: | ||||||
|     # Author: |     # Author: | ||||||
|     # |     # | ||||||
|     # Parameters: |     # Parameters: | ||||||
| 	#     a: ... | 	#     a: ... | ||||||
| 	#     b: ... | 	#     b: ... | ||||||
| 	# Value: | 	# Value: | ||||||
| 	#     result: ... | 	#     result: ... | ||||||
| 	# Example: <example invocation> | 	# Example: <example invocation> | ||||||
|  |  | ||||||
| 	# code ... | 	# code ... | ||||||
|  |  | ||||||
| 	return(result) | 	return(result) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # ====  TESTS  ================================================================= | # ====  TESTS  ================================================================= | ||||||
| # Enter your function tests here... | # Enter your function tests here... | ||||||
|  |  | ||||||
| if (FALSE) { | if (FALSE) { | ||||||
|   # test ... |   # test ... | ||||||
| } | } | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,21 +1,21 @@ | |||||||
| # .myProfile.R | # .myProfile.R | ||||||
| # This contains information which the course framework needs from time to time | # This contains information which the course framework needs from time to time | ||||||
| # to personalize assignments, validate submissions etc. Make sure that | # to personalize assignments, validate submissions etc. Make sure that | ||||||
| # the information correctly matches our official records. | # the information correctly matches our official records. | ||||||
| # myEMail          char      A string with your eMail address. Use your official | # myEMail          char      A string with your eMail address. Use your official | ||||||
| #                            UofT eMail address. | #                            UofT eMail address. | ||||||
| # myStudentNumber  numeric   Your UofT student number. Take care to have this | # myStudentNumber  numeric   Your UofT student number. Take care to have this | ||||||
| #                            correct. | #                            correct. | ||||||
| # | # | ||||||
| # NOTE: | # NOTE: | ||||||
| # After you have updated this script, move the file to your "myScripts" folder. | # After you have updated this script, move the file to your "myScripts" folder. | ||||||
| # Utility scripts will look for it on the path: "./myScripts/.myProfile.R" | # Utility scripts will look for it on the path: "./myScripts/.myProfile.R" | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # options(stringsAsFactors = FALSE) | # options(stringsAsFactors = FALSE) | ||||||
|  |  | ||||||
| myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca" | myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca" | ||||||
| myStudentNumber <- 1005845285  # e.g. 1003141592 | myStudentNumber <- 1005845285  # e.g. 1003141592 | ||||||
| MYSPE <- "Cutaneotrichosporon oleaginosum"  | MYSPE <- "Cutaneotrichosporon oleaginosum"  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,54 +1,51 @@ | |||||||
| myFA <-             readFASTA("data/RAB39B_HSa_coding.fa") | gen_mutations <- function(seq, N) { | ||||||
| myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa")) |   sealKey() # See: http://steipe.biochemistry.utoronto.ca/abc/index.php/BCH441_Code_submisson_instructions | ||||||
| myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa")) |   stats <- c() | ||||||
| myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa")) |   stats <- cbind(stats, c(0, 0, 0)) | ||||||
| rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names |   rownames(stats) <- c("silent", "missense", "nonsense") | ||||||
|  |   colnames(stats) <- c("occurrences") | ||||||
| gen_mutations <- function(seq, N) { |   # Actual function | ||||||
|   stats <- c() |   for (i in 1:N) { | ||||||
|   stats <- cbind(stats, c(0, 0, 0)) |     original_seq <- Biostrings::DNAString(seq) | ||||||
|   rownames(stats) <- c("silent", "missense", "nonsense") |     aa_seq <- Biostrings::translate(original_seq, no.init.codon = TRUE) | ||||||
|   colnames(stats) <- c("occurrences") |  | ||||||
|   # Actual function |     mut_seq <- Biostrings::DNAString(seq) | ||||||
|   for (i in 1:217) { |     mut_index <- sample(1:length(original_seq), 1, replace = TRUE) | ||||||
|     # select index for mutation |     possible_mutations <- Biostrings::DNA_BASES | ||||||
|     working_seq <- Biostrings::DNAString(seq) |     possible_mutations <- possible_mutations[possible_mutations != as.character(unlist(original_seq[mut_index]))] | ||||||
|     aa_seq <- Biostrings::translate(working_seq, no.init.codon = TRUE) |     mut_seq <- Biostrings::replaceLetterAt(mut_seq, mut_index, sample(possible_mutations, 1, replace = TRUE)) | ||||||
|     mut_action <- sample(c("ins", "del", "sub"), 1, TRUE) |     mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE) | ||||||
|     mut_seq <- Biostrings::DNAString(seq) |  | ||||||
|     if (mut_action == "sub") { |  | ||||||
|       mut_index <- sample(1:length(working_seq), 1, replace = TRUE) |     term_aa <- regexpr(pattern = "\\*", aa_seq) | ||||||
|       possible_mutations <- Biostrings::DNA_BASES |     term_mut_aa <- as.integer(regexpr(pattern = "\\*", mut_aa)) | ||||||
|       possible_mutations <- possible_mutations[possible_mutations != as.character(unlist(working_seq[mut_index]))] |     if ((term_aa == -1 && term_mut_aa != -1) || (term_mut_aa != -1 && term_mut_aa < term_aa)) { | ||||||
|       mut_change <- sample(possible_mutations, 1, replace = TRUE) |       stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"] | ||||||
|       mut_seq <- Biostrings::replaceLetterAt(mut_seq, mut_index, mut_change) |     } else if (mut_aa == aa_seq) { | ||||||
|     } else if (mut_action == "ins") { |       stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"] | ||||||
|       mut_index <- sample(1:length(working_seq) - 2, 1, replace = TRUE) |     } else { | ||||||
|       possible_mutations <- Biostrings::DNA_BASES |       stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"] | ||||||
|       mut_seq <- Biostrings::DNAString(paste(substring(working_seq, 1, mut_index - 1), sample(possible_mutations, 1), substring(working_seq, mut_index), sep = "")) |     } | ||||||
|     } else { |   } | ||||||
|       mut_index <- sample(1:length(working_seq), 1, replace = TRUE) |   sealKey() | ||||||
|       mut_seq <- mut_seq[-mut_index] |   return(stats) | ||||||
|     } | } | ||||||
|     mut_seq <- Biostrings::DNAString(substring(mut_seq, 1, length(mut_seq) - (length(mut_seq) %% 3))) |  | ||||||
|     mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE) | gen_mutations("ATGATGATGATGATGATG", 1000) | ||||||
|  | gen_mutations("CCCCCCCCCCCCCCCCCC", 500) | ||||||
|     # Note: we need silent, nonsense, and missense | gen_mutations("TATTACTATTACTATTAC", 500) | ||||||
|     mut_aa_stop <- match("*", Biostrings::as.matrix(mut_aa)) | gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", 500) | ||||||
|     aa_seq_stop <- match("*", Biostrings::as.matrix(aa_seq)) | gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", 500) | ||||||
|     if (!is.na(mut_aa_stop) & (is.na(aa_seq_stop) | mut_aa_stop < aa_seq_stop)) { | gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGA", 500) | ||||||
|       stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"] |  | ||||||
|     } else if (mut_aa == aa_seq) { |  | ||||||
|       stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"] | myFA <-             readFASTA("data/RAB39B_HSa_coding.fa") | ||||||
|     } else { | myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa")) | ||||||
|       stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"] | myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa")) | ||||||
|     } | myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa")) | ||||||
|   } | rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names | ||||||
|   return(stats) |  | ||||||
| } | gen_mutations(myFA["RAB39B", 2], 10000) | ||||||
| N_test <- 1200 | gen_mutations(myFA["PTPN5", 2], 10000) | ||||||
| gen_mutations("ATGATGATGATGATGATG", N_test) | gen_mutations(myFA["PTPN11", 2], 10000) | ||||||
| gen_mutations("CCCCCCCCCCCCCCCCCC", N_test) | gen_mutations(myFA["KRAS", 2], 10000) | ||||||
| gen_mutations("TATTACTATTACTATTAC", N_test) |  | ||||||
| gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", N_test) |  | ||||||
| gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", N_test) |  | ||||||
|   | |||||||
| @@ -1,41 +1,41 @@ | |||||||
| # ==   1.3  Task: submit for credit (part 1/2)  ================================ | # ==   1.3  Task: submit for credit (part 1/2)  ================================ | ||||||
| # == Submission - Code to add another philosopher to the datamodel: | # == Submission - Code to add another philosopher to the datamodel: | ||||||
|  |  | ||||||
| pID <- autoincrement(philDB$person) | pID <- autoincrement(philDB$person) | ||||||
| immanuelKant <- data.frame(id = pID, | immanuelKant <- data.frame(id = pID, | ||||||
|                            name = "Immanuel Kant", |                            name = "Immanuel Kant", | ||||||
|                            born = "1724", |                            born = "1724", | ||||||
|                            died = "1804", |                            died = "1804", | ||||||
|                            school = "Enlightenment Philosophy") |                            school = "Enlightenment Philosophy") | ||||||
| philDB$person <- rbind(philDB$person, immanuelKant) | philDB$person <- rbind(philDB$person, immanuelKant) | ||||||
|  |  | ||||||
| bID = autoincrement(philDB$books) | bID = autoincrement(philDB$books) | ||||||
| immanuelKantWork <- data.frame(id = bID, | immanuelKantWork <- data.frame(id = bID, | ||||||
|                                title = "Critique of Pure Reason", |                                title = "Critique of Pure Reason", | ||||||
|                                published = "1781") |                                published = "1781") | ||||||
| philDB$books <- rbind(philDB$books, immanuelKantWork) | philDB$books <- rbind(philDB$books, immanuelKantWork) | ||||||
| philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)) | philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)) | ||||||
|  |  | ||||||
| bID = autoincrement(philDB$books) | bID = autoincrement(philDB$books) | ||||||
| immanuelKantWork <- data.frame(id = bID, | immanuelKantWork <- data.frame(id = bID, | ||||||
|                                title = "Critique of Judgement", |                                title = "Critique of Judgement", | ||||||
|                                published = "1790") |                                published = "1790") | ||||||
| philDB$books <- rbind(philDB$books, immanuelKantWork) | philDB$books <- rbind(philDB$books, immanuelKantWork) | ||||||
| philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)) | philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID)) | ||||||
|  |  | ||||||
| # == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order. | # == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order. | ||||||
|  |  | ||||||
| schools <- unique(philDB$person$school) | schools <- unique(philDB$person$school) | ||||||
| schools <- sort(schools) | schools <- sort(schools) | ||||||
|  |  | ||||||
| for (s in schools) { | for (s in schools) { | ||||||
|   cat(sprintf("%s\n", s)) |   cat(sprintf("%s\n", s)) | ||||||
|   authors = which(philDB$person$school == s) |   authors = which(philDB$person$school == s) | ||||||
|   for (author in authors) { |   for (author in authors) { | ||||||
|     works = which(philDB$works$personID == author) |     works = which(philDB$works$personID == author) | ||||||
|     for (work in works) { |     for (work in works) { | ||||||
|       bookId = which(philDB$books$id == philDB$works$bookID[work]) |       bookId = which(philDB$books$id == philDB$works$bookID[work]) | ||||||
|       cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId])) |       cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId])) | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
| @@ -1,4 +1,4 @@ | |||||||
| [{ | [{ | ||||||
| 	"ID": 879819, | 	"ID": 879819, | ||||||
| 	"species": "Cutaneotrichosporon oleaginosum"} | 	"species": "Cutaneotrichosporon oleaginosum"} | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -1,19 +1,19 @@ | |||||||
| [ | [ | ||||||
|   { "name" : "MBP1_CUTOL", |   { "name" : "MBP1_CUTOL", | ||||||
|     "RefSeqID" : "XP_018278493.1", |     "RefSeqID" : "XP_018278493.1", | ||||||
|     "UniProtID" : "A0A0J0XLN0", |     "UniProtID" : "A0A0J0XLN0", | ||||||
|     "taxonomyID" : 879819, |     "taxonomyID" : 879819, | ||||||
|     "sequence" : [ |     "sequence" : [ | ||||||
|        "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ", |        "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ", | ||||||
|        "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK", |        "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK", | ||||||
|        "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS", |        "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS", | ||||||
|        "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA", |        "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA", | ||||||
|        "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN", |        "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN", | ||||||
|        "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD", |        "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD", | ||||||
|        "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT", |        "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT", | ||||||
|        "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND", |        "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND", | ||||||
|        "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD", |        "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD", | ||||||
|        "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID", |        "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID", | ||||||
|        "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"] |        "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"] | ||||||
|   } |   } | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -1,8 +1,8 @@ | |||||||
| README - myScripts folder: | README - myScripts folder: | ||||||
| ========================== | ========================== | ||||||
|  |  | ||||||
| The "myScripts" folder is a place to keep your personal files | The "myScripts" folder is a place to keep your personal files | ||||||
| safe. No files will be submitted into this folder on the GitHub master | safe. No files will be submitted into this folder on the GitHub master | ||||||
| copy. Therefore everything you put into this folder is safe from being | copy. Therefore everything you put into this folder is safe from being | ||||||
| inadvertently overwritten by a file with the same name that would be | inadvertently overwritten by a file with the same name that would be | ||||||
| downloaded in a GitHub "pull" request. | downloaded in a GitHub "pull" request. | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| source("./scripts/ABC-createRefDB.R") | source("./scripts/ABC-createRefDB.R") | ||||||
|  |  | ||||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json")) | myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json")) | ||||||
| myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json")) | myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json")) | ||||||
|   | |||||||
| @@ -1,38 +1,38 @@ | |||||||
| # myScript.R | # myScript.R | ||||||
| # | # | ||||||
| # --- As you work with this file, you can delete the instructions below -------- | # --- As you work with this file, you can delete the instructions below -------- | ||||||
| # Write your notes and code experiments into this document. Save it | # Write your notes and code experiments into this document. Save it | ||||||
| # from time to time - however I recommend that you do not _commit_ | # from time to time - however I recommend that you do not _commit_ | ||||||
| # your saved version. | # your saved version. | ||||||
| # | # | ||||||
| # As long as you do not _commit_ this script to version control, | # As long as you do not _commit_ this script to version control, | ||||||
| # you can _pull_ updated versions of the entire project from GitHub | # you can _pull_ updated versions of the entire project from GitHub | ||||||
| # by using the RStudio version control interface. However, once | # by using the RStudio version control interface. However, once | ||||||
| # you _commit_ any file in your local version, RStudio will require | # you _commit_ any file in your local version, RStudio will require | ||||||
| # you to resolve conflicts before you can _pull_ updates. | # you to resolve conflicts before you can _pull_ updates. | ||||||
| # --- As you work with this file, you can delete the instructions above -------- | # --- As you work with this file, you can delete the instructions above -------- | ||||||
| # | # | ||||||
| ## Purpose: <...> | ## Purpose: <...> | ||||||
| # | # | ||||||
| # Version: <...> | # Version: <...> | ||||||
| # | # | ||||||
| # Date:    <...> | # Date:    <...> | ||||||
| # Author:  <Name> (<name@mail.utoronto.ca>) | # Author:  <Name> (<name@mail.utoronto.ca>) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| # | # | ||||||
| #   <number>    <Features> | #   <number>    <Features> | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| #   <...> | #   <...> | ||||||
| # | # | ||||||
| # ==================================================================== | # ==================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										2868
									
								
								plottingReference.R
									
									
									
									
									
								
							
							
						
						
									
										2868
									
								
								plottingReference.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										150
									
								
								scriptTemplate.R
									
									
									
									
									
								
							
							
						
						
									
										150
									
								
								scriptTemplate.R
									
									
									
									
									
								
							| @@ -1,75 +1,75 @@ | |||||||
| # scriptTemplate.R | # scriptTemplate.R | ||||||
| # | # | ||||||
| # Purpose: | # Purpose: | ||||||
| # Version: | # Version: | ||||||
| # Date: | # Date: | ||||||
| # Author: | # Author: | ||||||
| # | # | ||||||
| # Input: | # Input: | ||||||
| # Output: | # Output: | ||||||
| # Dependencies: | # Dependencies: | ||||||
| # | # | ||||||
| # ToDo: | # ToDo: | ||||||
| # Notes: | # Notes: | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
| setwd("<your/project/directory>") | setwd("<your/project/directory>") | ||||||
|  |  | ||||||
| # ====  PARAMETERS  ============================================================ | # ====  PARAMETERS  ============================================================ | ||||||
| # Define and explain all parameters. No "magic numbers" in your code below. | # Define and explain all parameters. No "magic numbers" in your code below. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ====  PACKAGES  ============================================================== | # ====  PACKAGES  ============================================================== | ||||||
| # Check that required packages have been installed. Install if needed. | # Check that required packages have been installed. Install if needed. | ||||||
|  |  | ||||||
| if (! requireNamespace("seqinr", quietly=TRUE)) { | if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||||
|   install.packages("seqinr") |   install.packages("seqinr") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = seqinr)       # basic information | #  library(help = seqinr)       # basic information | ||||||
| #  browseVignettes("seqinr")    # available vignettes | #  browseVignettes("seqinr")    # available vignettes | ||||||
| #  data(package = "seqinr")     # available datasets | #  data(package = "seqinr")     # available datasets | ||||||
|  |  | ||||||
| # Note: use package functions with the :: operator - e.g. | # Note: use package functions with the :: operator - e.g. | ||||||
| # seqinr::aaa("K") | # seqinr::aaa("K") | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ====  FUNCTIONS  ============================================================= | # ====  FUNCTIONS  ============================================================= | ||||||
|  |  | ||||||
| # Define functions or source external files | # Define functions or source external files | ||||||
| source("<myUtilityFunctionsScript.R>") | source("<myUtilityFunctionsScript.R>") | ||||||
|  |  | ||||||
| myFunction <- function(a, b=1) { | myFunction <- function(a, b=1) { | ||||||
| 	# Purpose: | 	# Purpose: | ||||||
| 	#     Describe ... | 	#     Describe ... | ||||||
| 	# Parameters: | 	# Parameters: | ||||||
| 	#     a: ... | 	#     a: ... | ||||||
| 	#     b: ... | 	#     b: ... | ||||||
| 	# Value: | 	# Value: | ||||||
| 	#     result: ... | 	#     result: ... | ||||||
|  |  | ||||||
| 	# code ... | 	# code ... | ||||||
|  |  | ||||||
| 	return(result) | 	return(result) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ====  PROCESS  =============================================================== | # ====  PROCESS  =============================================================== | ||||||
| # Enter the step-by-step process of your project here. Strive to write your | # Enter the step-by-step process of your project here. Strive to write your | ||||||
| # code so that you can simply run this entire file and re-create all | # code so that you can simply run this entire file and re-create all | ||||||
| # intermediate results. | # intermediate results. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ====  TESTS  ================================================================= | # ====  TESTS  ================================================================= | ||||||
| # Enter your function tests here... | # Enter your function tests here... | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,30 +1,30 @@ | |||||||
| # ABC-createRefDB.R | # ABC-createRefDB.R | ||||||
| # | # | ||||||
| # Create a reference protein database for Mbp1-like proteins | # Create a reference protein database for Mbp1-like proteins | ||||||
| # | # | ||||||
| # Boris Steipe for ABC learning units | # Boris Steipe for ABC learning units | ||||||
| # | # | ||||||
| # For the species, see: | # For the species, see: | ||||||
| # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi | # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi | ||||||
| # | # | ||||||
| # For the data model, see | # For the data model, see | ||||||
| # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0 | # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0 | ||||||
| # For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R | # For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| myDB <- dbInit() | myDB <- dbInit() | ||||||
|  |  | ||||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json")) | myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json")) | ||||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json")) | myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json")) | ||||||
| myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json")) | myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json")) | ||||||
|  |  | ||||||
| myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json")) | myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json")) | ||||||
|  |  | ||||||
| myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json")) | myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json")) | ||||||
|  |  | ||||||
| myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json")) | myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json")) | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,443 +1,443 @@ | |||||||
| # tocID <- "scripts/ABC-makeMYSPElist.R" | # tocID <- "scripts/ABC-makeMYSPElist.R" | ||||||
| # | # | ||||||
| # Purpose:  Create a list of genome sequenced fungi with protein annotations and | # Purpose:  Create a list of genome sequenced fungi with protein annotations and | ||||||
| #               Mbp1 homologues. | #               Mbp1 homologues. | ||||||
| # | # | ||||||
| # Version: 1.4 | # Version: 1.4 | ||||||
| # | # | ||||||
| # Date:    2016  09  -  2021  09 | # Date:    2016  09  -  2021  09 | ||||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions | # Versions | ||||||
| #          1.4    New retrieval logic | #          1.4    New retrieval logic | ||||||
| #          1.3    Rewrite to change datasource. NCBI has not been updated | #          1.3    Rewrite to change datasource. NCBI has not been updated | ||||||
| #                   since 2012. Use ensembl fungi as initial source. | #                   since 2012. Use ensembl fungi as initial source. | ||||||
| #          1.2    Change from require() to requireNamespace() | #          1.2    Change from require() to requireNamespace() | ||||||
| #          1.1.2  Moved BLAST.R to ./scripts directory | #          1.1.2  Moved BLAST.R to ./scripts directory | ||||||
| #          1.1    Update 2017 | #          1.1    Update 2017 | ||||||
| #          1.0    First code 2016 | #          1.0    First code 2016 | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # | # | ||||||
| # DO NOT  source()  THIS FILE! | # DO NOT  source()  THIS FILE! | ||||||
| # | # | ||||||
| # This file is code I provide for your deeper understanding of a process and | # This file is code I provide for your deeper understanding of a process and | ||||||
| # to provide you with useful sample code. It is not actually necessary for | # to provide you with useful sample code. It is not actually necessary for | ||||||
| # you to run this code, but I encourage you to read it carefully and discuss | # you to run this code, but I encourage you to read it carefully and discuss | ||||||
| # if there are parts you don't understand. | # if there are parts you don't understand. | ||||||
| # | # | ||||||
| # Run the commands that interact with the NCBI servers only if you want to | # Run the commands that interact with the NCBI servers only if you want to | ||||||
| # experiment specifically with the code and/or parameters. I have commented out | # experiment specifically with the code and/or parameters. I have commented out | ||||||
| # those parts. If you only want to study the general workflow, just load() | # those parts. If you only want to study the general workflow, just load() | ||||||
| # the respective intermediate results. | # the respective intermediate results. | ||||||
| # | # | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                    Line | #TOC>   Section  Title                                    Line | ||||||
| #TOC> -------------------------------------------------------- | #TOC> -------------------------------------------------------- | ||||||
| #TOC>   1        The strategy                               55 | #TOC>   1        The strategy                               55 | ||||||
| #TOC>   2        PACKAGES AND INITIALIZATIONS               67 | #TOC>   2        PACKAGES AND INITIALIZATIONS               67 | ||||||
| #TOC>   3        ENSEMBL FUNGI                              75 | #TOC>   3        ENSEMBL FUNGI                              75 | ||||||
| #TOC>   3.1        Import                                   78 | #TOC>   3.1        Import                                   78 | ||||||
| #TOC>   4        BLAST SEARCH                              155 | #TOC>   4        BLAST SEARCH                              155 | ||||||
| #TOC>   4.1        find homologous proteins                161 | #TOC>   4.1        find homologous proteins                161 | ||||||
| #TOC>   4.2        Identify species in "hits"              192 | #TOC>   4.2        Identify species in "hits"              192 | ||||||
| #TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282 | #TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282 | ||||||
| #TOC>   6        STUDENT NUMBERS                           375 | #TOC>   6        STUDENT NUMBERS                           375 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  The strategy  ======================================================== | # =    1  The strategy  ======================================================== | ||||||
|  |  | ||||||
| # This script will create a list of "MYSPE" species and save it in an R object | # This script will create a list of "MYSPE" species and save it in an R object | ||||||
| # MYSPEspecies that is stored in the data subdirectory of this project from | # MYSPEspecies that is stored in the data subdirectory of this project from | ||||||
| # where it can be loaded. The strategy is as follows: we download a list of | # where it can be loaded. The strategy is as follows: we download a list of | ||||||
| # annotated fungal genomes from ensembl.fungi. All these are genome-sequenced | # annotated fungal genomes from ensembl.fungi. All these are genome-sequenced | ||||||
| # species that have been annotated. | # species that have been annotated. | ||||||
| # Next we perform a BLAST search, to identify fungal species that have | # Next we perform a BLAST search, to identify fungal species that have | ||||||
| # genes that are homologous to yeast MBP1. | # genes that are homologous to yeast MBP1. | ||||||
| # | # | ||||||
| # ... | # ... | ||||||
|  |  | ||||||
| # =    2  PACKAGES AND INITIALIZATIONS  ======================================== | # =    2  PACKAGES AND INITIALIZATIONS  ======================================== | ||||||
|  |  | ||||||
| # httr provides interfaces to Webservers on the Internet | # httr provides interfaces to Webservers on the Internet | ||||||
| if (! requireNamespace("httr", quietly = TRUE)) { | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  ENSEMBL FUNGI  ======================================================= | # =    3  ENSEMBL FUNGI  ======================================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.1  Import  ============================================================ | # ==   3.1  Import  ============================================================ | ||||||
|  |  | ||||||
| # Navigate to https://fungi.ensembl.org and click on the link to the full | # Navigate to https://fungi.ensembl.org and click on the link to the full | ||||||
| # list of all species: https://fungi.ensembl.org/species.html | # list of all species: https://fungi.ensembl.org/species.html | ||||||
| # On the page, click on the spreadsheet symbol top right and choose | # On the page, click on the spreadsheet symbol top right and choose | ||||||
| # "download whole table". The file will be named  "Species.csv", in your | # "download whole table". The file will be named  "Species.csv", in your | ||||||
| # usual downloads folder. Move it to the data folder, and read it. | # usual downloads folder. Move it to the data folder, and read it. | ||||||
|  |  | ||||||
| sDat <- read.csv("./data/Species.csv") | sDat <- read.csv("./data/Species.csv") | ||||||
| str(sDat) | str(sDat) | ||||||
|  |  | ||||||
| # The most obvious way to partition these is according to Classification ... | # The most obvious way to partition these is according to Classification ... | ||||||
| # (poking around a bit in the UniProt taxonomy database shows that the | # (poking around a bit in the UniProt taxonomy database shows that the | ||||||
| #  classification used here is the taxonomic rank of "order"). | #  classification used here is the taxonomic rank of "order"). | ||||||
| # how many classifications do we have? | # how many classifications do we have? | ||||||
| length(unique(sDat$Classification))  # 66 | length(unique(sDat$Classification))  # 66 | ||||||
|  |  | ||||||
| # To have a good set for the class, we should have about 100. | # To have a good set for the class, we should have about 100. | ||||||
| # Let's see for which of these we can find Mbp1 homologues. | # Let's see for which of these we can find Mbp1 homologues. | ||||||
| # First, we'll keep only the columns for name, classification, and taxID, and | # First, we'll keep only the columns for name, classification, and taxID, and | ||||||
| # drop the rest ... | # drop the rest ... | ||||||
| sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")] | sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")] | ||||||
| colnames(sDat) <- c("name", "order", "taxID") | colnames(sDat) <- c("name", "order", "taxID") | ||||||
|  |  | ||||||
| # Next, we make an extra column: genus - the first part of the binomial name. | # Next, we make an extra column: genus - the first part of the binomial name. | ||||||
| # We'll use the gsub() function, and for that we need a "regular expression" | # We'll use the gsub() function, and for that we need a "regular expression" | ||||||
| # that matches to all characters from the first blank to the end of the string: | # that matches to all characters from the first blank to the end of the string: | ||||||
| myPatt <- "\\s.*$"  # one whitespace (\\s) ... | myPatt <- "\\s.*$"  # one whitespace (\\s) ... | ||||||
|                     # followed by any character (.) 0..n times (*) ... |                     # followed by any character (.) 0..n times (*) ... | ||||||
|                     # until the end of the string |                     # until the end of the string | ||||||
|  |  | ||||||
| # using gsub() we substitute all matching characters with the empty string "" - | # using gsub() we substitute all matching characters with the empty string "" - | ||||||
| # this deletes the matching characters | # this deletes the matching characters | ||||||
| # Test this: | # Test this: | ||||||
| gsub(myPatt, "", "Genus")                      # one word: unchanged | gsub(myPatt, "", "Genus")                      # one word: unchanged | ||||||
| gsub(myPatt, "", "gEnus species")              # two words: return only first | gsub(myPatt, "", "gEnus species")              # two words: return only first | ||||||
| gsub(myPatt, "", "geNus species strain 123")   # many words: return only first | gsub(myPatt, "", "geNus species strain 123")   # many words: return only first | ||||||
|  |  | ||||||
| # apply this to the "name" column and add the result as a separate column | # apply this to the "name" column and add the result as a separate column | ||||||
| # called "genus" | # called "genus" | ||||||
| sDat$genus <- gsub(myPatt, "", sDat$name) | sDat$genus <- gsub(myPatt, "", sDat$name) | ||||||
|  |  | ||||||
| # what do we get? | # what do we get? | ||||||
| c(head(unique(sDat$genus)), | c(head(unique(sDat$genus)), | ||||||
|   tail(unique(sDat$genus)))  # inspect the first and last few. Note that there |   tail(unique(sDat$genus)))  # inspect the first and last few. Note that there | ||||||
|                              # is a problem that we have to keep in mind. |                              # is a problem that we have to keep in mind. | ||||||
|                              # (Always inspect your results!) |                              # (Always inspect your results!) | ||||||
| # Drop all rows for which the genus contains special characters - | # Drop all rows for which the genus contains special characters - | ||||||
| # like "[Candida]" | # like "[Candida]" | ||||||
| sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ] | sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ] | ||||||
|  |  | ||||||
| length(table(sDat$genus))    # how many genus? | length(table(sDat$genus))    # how many genus? | ||||||
| hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ... | hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ... | ||||||
|                                               # most genus have very few, but |                                               # most genus have very few, but | ||||||
|                                               # some have very many species. |                                               # some have very many species. | ||||||
| sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten... | sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten... | ||||||
|  |  | ||||||
| # We should have at least one species from each taxonomic order, but we can | # We should have at least one species from each taxonomic order, but we can | ||||||
| # add a few genus until we have about 100 validated species. | # add a few genus until we have about 100 validated species. | ||||||
|  |  | ||||||
| # Let's add a column for species, by changing our regular expression a bit, | # Let's add a column for species, by changing our regular expression a bit, | ||||||
| # using ^ (start of string), \\S (NOT a whitespace), | # using ^ (start of string), \\S (NOT a whitespace), | ||||||
| # and + (one or more matches), capturing the match (...), and returning | # and + (one or more matches), capturing the match (...), and returning | ||||||
| # it as the substitution (\\1) ... | # it as the substitution (\\1) ... | ||||||
|  |  | ||||||
| myPatt <- "^(\\S+\\s\\S+)\\s.*$" | myPatt <- "^(\\S+\\s\\S+)\\s.*$" | ||||||
| sDat$species <- gsub(myPatt, "\\1", sDat$name) | sDat$species <- gsub(myPatt, "\\1", sDat$name) | ||||||
|  |  | ||||||
| # And we reorder the columns, just for aesthetics: | # And we reorder the columns, just for aesthetics: | ||||||
| sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")] | sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")] | ||||||
|  |  | ||||||
| # Final check: | # Final check: | ||||||
| any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters | any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters | ||||||
|  |  | ||||||
| # | # | ||||||
| # Now we check which of these have Mbp1 homologues ... | # Now we check which of these have Mbp1 homologues ... | ||||||
|  |  | ||||||
| # =    4  BLAST SEARCH  ======================================================== | # =    4  BLAST SEARCH  ======================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # We run a BLAST search to find all proteins related to yeast Mbp1 in any | # We run a BLAST search to find all proteins related to yeast Mbp1 in any | ||||||
| # fungus. With the results, we'll annotate our sDat table. | # fungus. With the results, we'll annotate our sDat table. | ||||||
|  |  | ||||||
| # ==   4.1  find homologous proteins  ========================================== | # ==   4.1  find homologous proteins  ========================================== | ||||||
| # | # | ||||||
| # Use BLAST to fetch proteins related to Mbp1 and identify the species that | # Use BLAST to fetch proteins related to Mbp1 and identify the species that | ||||||
| # contain them. | # contain them. | ||||||
|  |  | ||||||
| # Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair | # Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair | ||||||
| # amount of error handling involved that is not supported by the API in a | # amount of error handling involved that is not supported by the API in a | ||||||
| # principled way but requires rather ad hoc solutions. The code I threw together | # principled way but requires rather ad hoc solutions. The code I threw together | ||||||
| # to make a BLAST interface (demo-quality, not research-quality) is in the file | # to make a BLAST interface (demo-quality, not research-quality) is in the file | ||||||
| # ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty | # ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty | ||||||
| # standard task of communicating with servers and parsing responses - everyday | # standard task of communicating with servers and parsing responses - everyday | ||||||
| # fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST | # fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST | ||||||
| # parser in currently available packages. | # parser in currently available packages. | ||||||
| # | # | ||||||
| # DON'T use this for BLAST searches unless you have read the NCBI policy | # DON'T use this for BLAST searches unless you have read the NCBI policy | ||||||
| # for automated tasks. If you indiscriminately pound on the NCBI's BLAST | # for automated tasks. If you indiscriminately pound on the NCBI's BLAST | ||||||
| # server, they will blacklist your IP-address. See: | # server, they will blacklist your IP-address. See: | ||||||
| # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo | # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo | ||||||
| # | # | ||||||
| # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq | # Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq | ||||||
| # BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID | # BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID | ||||||
| #                    db = "refseq_protein",        # database to search in | #                    db = "refseq_protein",        # database to search in | ||||||
| #                    nHits = 3000,                 # 945 hits in 2020 | #                    nHits = 3000,                 # 945 hits in 2020 | ||||||
| #                    E = 0.01,                     # | #                    E = 0.01,                     # | ||||||
| #                    limits = "txid4751[ORGN]")    # = fungi | #                    limits = "txid4751[ORGN]")    # = fungi | ||||||
| # saveRDS(BLASThits, file="data/BLASThits.rds") | # saveRDS(BLASThits, file="data/BLASThits.rds") | ||||||
| # | # | ||||||
| # NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory | # NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory | ||||||
| # | # | ||||||
| BLASThits <- readRDS(file = "data/BLASThits.rds") | BLASThits <- readRDS(file = "data/BLASThits.rds") | ||||||
|  |  | ||||||
| # ==   4.2  Identify species in "hits"  ======================================== | # ==   4.2  Identify species in "hits"  ======================================== | ||||||
|  |  | ||||||
| # This is a very big list that can't be usefully analyzed manually. Here | # This is a very big list that can't be usefully analyzed manually. Here | ||||||
| # we are only interested in the species names that it contains. | # we are only interested in the species names that it contains. | ||||||
|  |  | ||||||
| # How many hits in the list? | # How many hits in the list? | ||||||
| length(BLASThits$hits)      # 1,134 | length(BLASThits$hits)      # 1,134 | ||||||
|  |  | ||||||
| # Let's look at a hit somewhere down the list | # Let's look at a hit somewhere down the list | ||||||
| str(BLASThits$hit[[277]]) | str(BLASThits$hit[[277]]) | ||||||
|  |  | ||||||
| # A fair amount of parsing has gone into the BLAST.R code to prepare the results | # A fair amount of parsing has gone into the BLAST.R code to prepare the results | ||||||
| # in a useful way. The species information is in the $species element of every | # in a useful way. The species information is in the $species element of every | ||||||
| # hit. | # hit. | ||||||
|  |  | ||||||
| # Run a loop to extract all the species names into a vector. We subset ... | # Run a loop to extract all the species names into a vector. We subset ... | ||||||
| # BLASThits$hits                 ... the list of hits, from which we choose ... | # BLASThits$hits                 ... the list of hits, from which we choose ... | ||||||
| # BLASThits$hits[[i]]            ... the i-th hit, and get ... | # BLASThits$hits[[i]]            ... the i-th hit, and get ... | ||||||
| # BLASThits$hits[[i]]$species    ... the species element from that. | # BLASThits$hits[[i]]$species    ... the species element from that. | ||||||
| # Subsetting FTW. | # Subsetting FTW. | ||||||
|  |  | ||||||
| BLASTspecies <- character() | BLASTspecies <- character() | ||||||
| for (i in seq_along(BLASThits$hits)) { | for (i in seq_along(BLASThits$hits)) { | ||||||
|     BLASTspecies[i] <- BLASThits$hits[[i]]$species |     BLASTspecies[i] <- BLASThits$hits[[i]]$species | ||||||
| } | } | ||||||
|  |  | ||||||
| # You can confirm that BLASTspecies has the expected size. | # You can confirm that BLASTspecies has the expected size. | ||||||
| length(BLASTspecies) | length(BLASTspecies) | ||||||
|  |  | ||||||
| # if we delete some of these later on, we still want to remember which hit | # if we delete some of these later on, we still want to remember which hit | ||||||
| # they came from. Thus we name() the elements with their index, which is the | # they came from. Thus we name() the elements with their index, which is the | ||||||
| # same as the index of the hit in BLASThits | # same as the index of the hit in BLASThits | ||||||
| names(BLASTspecies) <- 1:length(BLASTspecies) | names(BLASTspecies) <- 1:length(BLASTspecies) | ||||||
|  |  | ||||||
|  |  | ||||||
| # let's plot the distribution of E-values | # let's plot the distribution of E-values | ||||||
| eVals <- numeric() | eVals <- numeric() | ||||||
| for (i in seq_along(BLASThits$hits)) { | for (i in seq_along(BLASThits$hits)) { | ||||||
|   eVals[i] <- BLASThits$hits[[i]]$E |   eVals[i] <- BLASThits$hits[[i]]$E | ||||||
| } | } | ||||||
| range(eVals) | range(eVals) | ||||||
| sum(eVals == 0) | sum(eVals == 0) | ||||||
|  |  | ||||||
| # let's plot the log of all values > 0 to see how they are distributed | # let's plot the log of all values > 0 to see how they are distributed | ||||||
| # plotting only one vector of numbers plots their index as x, and | # plotting only one vector of numbers plots their index as x, and | ||||||
| # their value as y ... | # their value as y ... | ||||||
| plot(log(eVals[eVals > 0]), col = "#CC0000") | plot(log(eVals[eVals > 0]), col = "#CC0000") | ||||||
|  |  | ||||||
| # This is very informative: I would suspect that the first ten or so are | # This is very informative: I would suspect that the first ten or so are | ||||||
| # virtually identical to the yeast protein, then we have about 800 hits with | # virtually identical to the yeast protein, then we have about 800 hits with | ||||||
| # decreasing similarity, and then about 200 more that may actually be false | # decreasing similarity, and then about 200 more that may actually be false | ||||||
| # positives. Also - we plotted them by index, that means the table is SORTED: | # positives. Also - we plotted them by index, that means the table is SORTED: | ||||||
| # Lower E-values strictly come before higher E-values. | # Lower E-values strictly come before higher E-values. | ||||||
|  |  | ||||||
| # Again, some species appear more than once, e.g. ... | # Again, some species appear more than once, e.g. ... | ||||||
| sum(BLASTspecies == "Saccharomyces cerevisiae") | sum(BLASTspecies == "Saccharomyces cerevisiae") | ||||||
|  |  | ||||||
| # ... corresponding to the five homologous gene sequences (paralogues) of yeast. | # ... corresponding to the five homologous gene sequences (paralogues) of yeast. | ||||||
|  |  | ||||||
| # Therefore we remove duplicates. Removing duplicates will leave the FIRST | # Therefore we remove duplicates. Removing duplicates will leave the FIRST | ||||||
| # in a list alone, and only remove the SUBSEQUENT ones. Which means, from each | # in a list alone, and only remove the SUBSEQUENT ones. Which means, from each | ||||||
| # species, we will retain only the protein that has the highest similarity | # species, we will retain only the protein that has the highest similarity | ||||||
| # to yeast Mbp1, not any of its more distant paralogues. | # to yeast Mbp1, not any of its more distant paralogues. | ||||||
| sel <- ! duplicated(BLASTspecies) | sel <- ! duplicated(BLASTspecies) | ||||||
| BLASTspecies <- BLASTspecies[sel] | BLASTspecies <- BLASTspecies[sel] | ||||||
|  |  | ||||||
| length(BLASTspecies) | length(BLASTspecies) | ||||||
| # i.e. we got rid of about two thirds of the hits. | # i.e. we got rid of about two thirds of the hits. | ||||||
| tail(BLASTspecies)  # see how the names are useful! | tail(BLASTspecies)  # see how the names are useful! | ||||||
|                     # again - there are some special characters ... |                     # again - there are some special characters ... | ||||||
|                     # what are they? |                     # what are they? | ||||||
| BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)] | BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)] | ||||||
|  |  | ||||||
| # remove the brackets ... | # remove the brackets ... | ||||||
| BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies) | BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies) | ||||||
| # drop any new duplicates ... | # drop any new duplicates ... | ||||||
| BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)] | BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)] | ||||||
|  |  | ||||||
| # check the number again: | # check the number again: | ||||||
| length(BLASTspecies) | length(BLASTspecies) | ||||||
| # Think a bit about this: what may be the biological reason to find that | # Think a bit about this: what may be the biological reason to find that | ||||||
| # on average, in 388 fungi across the entire phylogenetic tree, we have | # on average, in 388 fungi across the entire phylogenetic tree, we have | ||||||
| # three sequences that are homologous to yeast Mbp1? | # three sequences that are homologous to yeast Mbp1? | ||||||
|  |  | ||||||
| # Let's look at the distribution of E-values in this selection (Subsetting FTW): | # Let's look at the distribution of E-values in this selection (Subsetting FTW): | ||||||
| # we plot all values that are TRUE in the vector "sel" that we created above, | # we plot all values that are TRUE in the vector "sel" that we created above, | ||||||
| # AND greater than 0 | # AND greater than 0 | ||||||
| plot(log(eVals[sel & eVals > 0]), col = "#00CC00") | plot(log(eVals[sel & eVals > 0]), col = "#00CC00") | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    5  MERGE ENSEMBL AND BLAST RESULTS  ===================================== | # =    5  MERGE ENSEMBL AND BLAST RESULTS  ===================================== | ||||||
|  |  | ||||||
| # Next we add the blast result to our sDat dataframe. We'll store the index, | # Next we add the blast result to our sDat dataframe. We'll store the index, | ||||||
| # the E-value, and the Query-bounds from which we can estimate which domains | # the E-value, and the Query-bounds from which we can estimate which domains | ||||||
| # of Mbp1 are actually covered by the hit. (True orthologues MUST align with | # of Mbp1 are actually covered by the hit. (True orthologues MUST align with | ||||||
| # Mbp1's N-terminal APSES domain.) | # Mbp1's N-terminal APSES domain.) | ||||||
| # | # | ||||||
| # First we pull the hits we wanted from the BLASTspecies: | # First we pull the hits we wanted from the BLASTspecies: | ||||||
| iHits <- as.numeric(names(BLASTspecies)) | iHits <- as.numeric(names(BLASTspecies)) | ||||||
| length(iHits)     # one index for each TRUE in sel | length(iHits)     # one index for each TRUE in sel | ||||||
|  |  | ||||||
| # add columns to sDat | # add columns to sDat | ||||||
| l <- nrow(sDat) | l <- nrow(sDat) | ||||||
| sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results | sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results | ||||||
| sDat$eVal   <- numeric(l)  # E-value of the hit | sDat$eVal   <- numeric(l)  # E-value of the hit | ||||||
| sDat$lAli   <- numeric(l)  # length of the aligned region | sDat$lAli   <- numeric(l)  # length of the aligned region | ||||||
|  |  | ||||||
| # extract and merge | # extract and merge | ||||||
| for (iHit in iHits) { | for (iHit in iHits) { | ||||||
|   thisSp <- BLASThits$hits[[iHit]]$species |   thisSp <- BLASThits$hits[[iHit]]$species | ||||||
|   sel <- sDat$species == thisSp |   sel <- sDat$species == thisSp | ||||||
|  |  | ||||||
|   sDat$iHit[sel]   <- iHit |   sDat$iHit[sel]   <- iHit | ||||||
|   sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E |   sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E | ||||||
|   sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli |   sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli | ||||||
| } | } | ||||||
|  |  | ||||||
| # Are all reference species accounted for? | # Are all reference species accounted for? | ||||||
| selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit | selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit | ||||||
| REFspecies %in% sDat$species[selA]     # yes, all there | REFspecies %in% sDat$species[selA]     # yes, all there | ||||||
|  |  | ||||||
| selB <- sDat$species %in% REFspecies   # all rows which have one of REF species | selB <- sDat$species %in% REFspecies   # all rows which have one of REF species | ||||||
|  |  | ||||||
| sum(selA & selB)   # How many rows? | sum(selA & selB)   # How many rows? | ||||||
|  |  | ||||||
| # sDat of course includes all duplicates. Some may be multiply sequenced, some | # sDat of course includes all duplicates. Some may be multiply sequenced, some | ||||||
| # may be different strains. We'll use the same strategy as before and keep | # may be different strains. We'll use the same strategy as before and keep | ||||||
| # only the best hit: order the rows by E-value, then drop all rows which | # only the best hit: order the rows by E-value, then drop all rows which | ||||||
| # are duplicated. | # are duplicated. | ||||||
|  |  | ||||||
|  |  | ||||||
| # drop all rows without BLAST hits ... | # drop all rows without BLAST hits ... | ||||||
| sDat <- sDat[ ! (sDat$iHit == 0) , ] | sDat <- sDat[ ! (sDat$iHit == 0) , ] | ||||||
|  |  | ||||||
| # order sDat by E-value ... | # order sDat by E-value ... | ||||||
| sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ] | sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ] | ||||||
|  |  | ||||||
| # drop all rows with duplicated species ... | # drop all rows with duplicated species ... | ||||||
| sDat <- sDat[ ! duplicated(sDat$species) , ] | sDat <- sDat[ ! duplicated(sDat$species) , ] | ||||||
|  |  | ||||||
| # Let's look at the E-values ... | # Let's look at the E-values ... | ||||||
| plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00") | plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00") | ||||||
|  |  | ||||||
| # and alignment lengths ... | # and alignment lengths ... | ||||||
| plot(sDat$lAli, col = "#00DDAA") | plot(sDat$lAli, col = "#00DDAA") | ||||||
|  |  | ||||||
| # How many ... | # How many ... | ||||||
| length(unique(sDat$name)) | length(unique(sDat$name)) | ||||||
| length(unique(sDat$species)) | length(unique(sDat$species)) | ||||||
| length(unique(sDat$genus)) | length(unique(sDat$genus)) | ||||||
| length(unique(sDat$order)) | length(unique(sDat$order)) | ||||||
|  |  | ||||||
| # I need an extra species for admin purposes later on ... | # I need an extra species for admin purposes later on ... | ||||||
| sel <- grep("Sporothrix schenckii", sDat$species) | sel <- grep("Sporothrix schenckii", sDat$species) | ||||||
| SPOSCdat <- sDat[sel, ] | SPOSCdat <- sDat[sel, ] | ||||||
| sDat <- sDat[-sel, ] | sDat <- sDat[-sel, ] | ||||||
|  |  | ||||||
| # To get the final dataset, we remove the reference species with their | # To get the final dataset, we remove the reference species with their | ||||||
| # entire orders ... | # entire orders ... | ||||||
| REForders <- unique(sDat$order[sDat$species %in% REFspecies]) | REForders <- unique(sDat$order[sDat$species %in% REFspecies]) | ||||||
| sel <- sDat$order %in% REForders | sel <- sDat$order %in% REForders | ||||||
| REFdat <- sDat[sel , ] | REFdat <- sDat[sel , ] | ||||||
| sDat   <- sDat[ ! sel , ] | sDat   <- sDat[ ! sel , ] | ||||||
|  |  | ||||||
| # REFdat should now contain only the REFspecies ... | # REFdat should now contain only the REFspecies ... | ||||||
| ( REFdat <- REFdat[REFdat$species %in% REFspecies , ] ) | ( REFdat <- REFdat[REFdat$species %in% REFspecies , ] ) | ||||||
|  |  | ||||||
| # ... but all of them | # ... but all of them | ||||||
| sum(REFspecies %in% REFdat$species) | sum(REFspecies %in% REFdat$species) | ||||||
|  |  | ||||||
| # ... and we have enough left in sDat to prune sDat to unique genus | # ... and we have enough left in sDat to prune sDat to unique genus | ||||||
| sDat <- sDat[ ! duplicated(sDat$genus) , ] | sDat <- sDat[ ! duplicated(sDat$genus) , ] | ||||||
| nrow(sDat)   # 84 | nrow(sDat)   # 84 | ||||||
|  |  | ||||||
| # I add back "Sporothrix schenckii" ... | # I add back "Sporothrix schenckii" ... | ||||||
| sDat <- rbind(SPOSCdat, sDat) | sDat <- rbind(SPOSCdat, sDat) | ||||||
|  |  | ||||||
| # ... and save for future use. | # ... and save for future use. | ||||||
| # saveRDS(sDat, file = "data/sDat.rds") | # saveRDS(sDat, file = "data/sDat.rds") | ||||||
| # saveRDS(REFdat, file = "data/REFdat.rds") | # saveRDS(REFdat, file = "data/REFdat.rds") | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    6  STUDENT NUMBERS  ===================================================== | # =    6  STUDENT NUMBERS  ===================================================== | ||||||
| # | # | ||||||
| # An asymmetric function to retrieve a MYSPE species | # An asymmetric function to retrieve a MYSPE species | ||||||
| # | # | ||||||
| sDat <- readRDS(file = "data/sDat.rds") | sDat <- readRDS(file = "data/sDat.rds") | ||||||
|  |  | ||||||
| students <- read.csv("../BCH441-2021-students.csv") | students <- read.csv("../BCH441-2021-students.csv") | ||||||
| sN <- students$Integration.ID | sN <- students$Integration.ID | ||||||
| sN <- sN[! is.na(sN)] | sN <- sN[! is.na(sN)] | ||||||
| sN <- as.character(sN) | sN <- as.character(sN) | ||||||
| sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii" | sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii" | ||||||
|  |  | ||||||
| set.seed(112358) | set.seed(112358) | ||||||
| theseSpecies <- sDat[sample(1:nrow(sDat)), ] | theseSpecies <- sDat[sample(1:nrow(sDat)), ] | ||||||
| all(sort(theseSpecies$name) == sort(sDat$name)) | all(sort(theseSpecies$name) == sort(sDat$name)) | ||||||
| nrow((theseSpecies)) | nrow((theseSpecies)) | ||||||
| (iX <- grep("Sporothrix schenckii", theseSpecies$name)) | (iX <- grep("Sporothrix schenckii", theseSpecies$name)) | ||||||
| theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ]) | theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ]) | ||||||
| rndMin <-  992000000 | rndMin <-  992000000 | ||||||
| rndMax <- 1020000000 | rndMax <- 1020000000 | ||||||
| N <- 10000 | N <- 10000 | ||||||
| keys <- as.character(sample(rndMin:rndMax, N + 1000)) | keys <- as.character(sample(rndMin:rndMax, N + 1000)) | ||||||
| keys <- keys[! (keys %in% sN)] | keys <- keys[! (keys %in% sN)] | ||||||
| keys <- keys[1:N] | keys <- keys[1:N] | ||||||
| keys[1:length(sN)] <- sN | keys[1:length(sN)] <- sN | ||||||
|  |  | ||||||
| nRep <- floor(N/nrow(theseSpecies)) | nRep <- floor(N/nrow(theseSpecies)) | ||||||
| MYSPEdat <- theseSpecies | MYSPEdat <- theseSpecies | ||||||
| for(i in 1:nRep) { | for(i in 1:nRep) { | ||||||
|   MYSPEdat <- rbind(MYSPEdat, theseSpecies) |   MYSPEdat <- rbind(MYSPEdat, theseSpecies) | ||||||
| } | } | ||||||
| MYSPEdat <- MYSPEdat[1:N, ] | MYSPEdat <- MYSPEdat[1:N, ] | ||||||
| for (i in 1:N) { | for (i in 1:N) { | ||||||
|   rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5") |   rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5") | ||||||
| } | } | ||||||
| set.seed(NULL) | set.seed(NULL) | ||||||
| MYSPEdat <- MYSPEdat[sample(1:N), ] | MYSPEdat <- MYSPEdat[sample(1:N), ] | ||||||
|  |  | ||||||
| # saveRDS(MYSPEdat, file = "data/MYSPEdat.rds") | # saveRDS(MYSPEdat, file = "data/MYSPEdat.rds") | ||||||
|  |  | ||||||
| # === validate | # === validate | ||||||
| x <- character() | x <- character() | ||||||
| for (n in sN) { | for (n in sN) { | ||||||
|   sp <- getMYSPE(n) |   sp <- getMYSPE(n) | ||||||
|   if (length(sp) != 1) { |   if (length(sp) != 1) { | ||||||
|     stop(print(as.character(n))) |     stop(print(as.character(n))) | ||||||
|   } else { |   } else { | ||||||
|     x <- c(x, sp) |     x <- c(x, sp) | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| # === species for late-comers | # === species for late-comers | ||||||
| y <- unique(MYSPEdat$species) | y <- unique(MYSPEdat$species) | ||||||
| print(y[!(y %in% x)]) | print(y[!(y %in% x)]) | ||||||
|  |  | ||||||
|  |  | ||||||
| # === validate | # === validate | ||||||
| l <- length(sN) | l <- length(sN) | ||||||
| sp <- character(l) | sp <- character(l) | ||||||
| for(i in 1:l) { | for(i in 1:l) { | ||||||
|   sp[i] <- getMYSPE(sN[i]) |   sp[i] <- getMYSPE(sN[i]) | ||||||
| } | } | ||||||
| any(duplicated(sp)) | any(duplicated(sp)) | ||||||
| length(unique(sp)) | length(unique(sp)) | ||||||
| which(! sDat$species %in% sp)  # these can be assigned to late-comers | which(! sDat$species %in% sp)  # these can be assigned to late-comers | ||||||
|  |  | ||||||
| # Done. | # Done. | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,168 +1,168 @@ | |||||||
| # tocID <- "scripts/ABC-makeSTRINGedges.R" | # tocID <- "scripts/ABC-makeSTRINGedges.R" | ||||||
| # | # | ||||||
| # Create a subnetwork of high-confidence human STRING edges. | # Create a subnetwork of high-confidence human STRING edges. | ||||||
| # | # | ||||||
| # Notes: | # Notes: | ||||||
| # | # | ||||||
| #      The large source- datafile is NOT posted to github. If you want to | #      The large source- datafile is NOT posted to github. If you want to | ||||||
| #      experiment with the original data, download it and place it into your | #      experiment with the original data, download it and place it into your | ||||||
| #      local  ./data  directory. | #      local  ./data  directory. | ||||||
| # | # | ||||||
| #      STRING data source: | #      STRING data source: | ||||||
| #        Download page: | #        Download page: | ||||||
| # https://string-db.org/cgi/download.pl?species_text=Homo+sapiens | # https://string-db.org/cgi/download.pl?species_text=Homo+sapiens | ||||||
| #        Data: (127.6 Mb) | #        Data: (127.6 Mb) | ||||||
| # https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz | # https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.0 | ||||||
| # | # | ||||||
| # Date:     2020-09 | # Date:     2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.0    Rewrite | #           1.0    Rewrite | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                             Line | #TOC>   Section  Title                             Line | ||||||
| #TOC> ------------------------------------------------- | #TOC> ------------------------------------------------- | ||||||
| #TOC>   1        Initialize                          44 | #TOC>   1        Initialize                          44 | ||||||
| #TOC>   2        Read STRING Data                    51 | #TOC>   2        Read STRING Data                    51 | ||||||
| #TOC>   3        Define cutoff and subset            63 | #TOC>   3        Define cutoff and subset            63 | ||||||
| #TOC>   4        Drop  duplicates                   103 | #TOC>   4        Drop  duplicates                   103 | ||||||
| #TOC>   5        Simple statistics                  127 | #TOC>   5        Simple statistics                  127 | ||||||
| #TOC>   6        Write to file                      160 | #TOC>   6        Write to file                      160 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Initialize  ========================================================== | # =    1  Initialize  ========================================================== | ||||||
|  |  | ||||||
| if (! requireNamespace("readr", quietly = TRUE)) { | if (! requireNamespace("readr", quietly = TRUE)) { | ||||||
|   install.packages("readr") |   install.packages("readr") | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Read STRING Data  ==================================================== | # =    2  Read STRING Data  ==================================================== | ||||||
|  |  | ||||||
| # Read STRING Data (needs to be downloaded from database, see URL in Notes) | # Read STRING Data (needs to be downloaded from database, see URL in Notes) | ||||||
| # The .gz compressed version is 127.6MB, the uncompressed version is probably | # The .gz compressed version is 127.6MB, the uncompressed version is probably | ||||||
| # 848 Mb. Fortunately readr:: can read from compressed | # 848 Mb. Fortunately readr:: can read from compressed | ||||||
| # files, and does so automatically, based on the file extension. | # files, and does so automatically, based on the file extension. | ||||||
| ( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") ) | ( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") ) | ||||||
| STR <- readr::read_delim(fn, delim = " ") | STR <- readr::read_delim(fn, delim = " ") | ||||||
| nrow(STR)  #  11,759,454 rows | nrow(STR)  #  11,759,454 rows | ||||||
| head(STR) | head(STR) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Define cutoff and subset  ============================================ | # =    3  Define cutoff and subset  ============================================ | ||||||
|  |  | ||||||
| # approximate distribution of combined_score | # approximate distribution of combined_score | ||||||
| hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF") | hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF") | ||||||
|  |  | ||||||
| # Let's table the counts >= 850 and plot them for better resolution. | # Let's table the counts >= 850 and plot them for better resolution. | ||||||
|  |  | ||||||
| myTb <- table(STR$combined_score[STR$combined_score >= 850]) | myTb <- table(STR$combined_score[STR$combined_score >= 850]) | ||||||
| is.unsorted(as.integer(names(myTb)))  # Good - they are all in order | is.unsorted(as.integer(names(myTb)))  # Good - they are all in order | ||||||
|  |  | ||||||
| plot(myTb, type = "b", cex = 0.5, col = "#BB0000") | plot(myTb, type = "b", cex = 0.5, col = "#BB0000") | ||||||
| myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that | myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that | ||||||
|                          # frequently assigns a combined score of 0.900 |                          # frequently assigns a combined score of 0.900 | ||||||
|  |  | ||||||
| # Let's plot these counts as cumulative sums, in reverse order, scaled | # Let's plot these counts as cumulative sums, in reverse order, scaled | ||||||
| # as combined scores. | # as combined scores. | ||||||
| myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing | myX <- 1 - (1:length(myTb)) / 1000   # x-values, decreasing | ||||||
| plot(myX, | plot(myX, | ||||||
|      cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing |      cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing | ||||||
|      xlim = c(1.0, 0.85),            # reverse x-axis |      xlim = c(1.0, 0.85),            # reverse x-axis | ||||||
|      type = "l", |      type = "l", | ||||||
|      main = "STRING interactions for 9606 (top 600,000)", |      main = "STRING interactions for 9606 (top 600,000)", | ||||||
|      xlab = "combined_score", |      xlab = "combined_score", | ||||||
|      ylab = "cumulative counts", |      ylab = "cumulative counts", | ||||||
|      col = "#CC0000") |      col = "#CC0000") | ||||||
| abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF") | abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF") | ||||||
|  |  | ||||||
| # What's the cutoff for 100,000 edges? | # What's the cutoff for 100,000 edges? | ||||||
| which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964 | which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964 | ||||||
|  |  | ||||||
| # confirm | # confirm | ||||||
| sum(STR$combined_score >= 964) # 101,348 | sum(STR$combined_score >= 964) # 101,348 | ||||||
| abline(v = 0.964, lwd = 0.5, col = "#DDDDFF") | abline(v = 0.964, lwd = 0.5, col = "#DDDDFF") | ||||||
|  |  | ||||||
| # subset the table, and use only the protein IDs and the combined_score | # subset the table, and use only the protein IDs and the combined_score | ||||||
| STR <- STR[STR$combined_score >= 964, | STR <- STR[STR$combined_score >= 964, | ||||||
|             c("protein1", "protein2", "combined_score")] |             c("protein1", "protein2", "combined_score")] | ||||||
| colnames(STR) <- c("a", "b", "score") | colnames(STR) <- c("a", "b", "score") | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Drop  duplicates  ==================================================== | # =    4  Drop  duplicates  ==================================================== | ||||||
|  |  | ||||||
| # identify duplicate interactions by creating keys in a defined alphabetical | # identify duplicate interactions by creating keys in a defined alphabetical | ||||||
| # sort order, then checking for  duplicated(). | # sort order, then checking for  duplicated(). | ||||||
| # e.g  if we have (X:U, U:X), we change U:X to X:U and now find that | # e.g  if we have (X:U, U:X), we change U:X to X:U and now find that | ||||||
| # (X:U, X:U) has a duplicate. | # (X:U, X:U) has a duplicate. | ||||||
|  |  | ||||||
| AB <- STR$a < STR$b        # logical vector: genes we need to swap | AB <- STR$a < STR$b        # logical vector: genes we need to swap | ||||||
| tmp <- STR$b               # copy column b | tmp <- STR$b               # copy column b | ||||||
| STR$b[AB] <- STR$a[AB]     # copy a's into b | STR$b[AB] <- STR$a[AB]     # copy a's into b | ||||||
| STR$a[AB] <- tmp[AB]       # copy tmp's into a | STR$a[AB] <- tmp[AB]       # copy tmp's into a | ||||||
| all(STR$a >= STR$b)        # confirm: TRUE | all(STR$a >= STR$b)        # confirm: TRUE | ||||||
|  |  | ||||||
| # now, make combined keys, like this: | # now, make combined keys, like this: | ||||||
| paste0(STR$a[1:10], ":", STR$b[1:10]) | paste0(STR$a[1:10], ":", STR$b[1:10]) | ||||||
|  |  | ||||||
| tmp <- paste0(STR$a, ":", STR$b) | tmp <- paste0(STR$a, ":", STR$b) | ||||||
| sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports | sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports | ||||||
|                      # both a:b and b:a ! |                      # both a:b and b:a ! | ||||||
|  |  | ||||||
| # drop all interactions from STR whose key in tmp is duplicated | # drop all interactions from STR whose key in tmp is duplicated | ||||||
| STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain | STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    5  Simple statistics  =================================================== | # =    5  Simple statistics  =================================================== | ||||||
|  |  | ||||||
| # how many unique genes? | # how many unique genes? | ||||||
| length(unique(c(STR$a, STR$b)))   # 8,445 | length(unique(c(STR$a, STR$b)))   # 8,445 | ||||||
|  |  | ||||||
| # how many self-edges? | # how many self-edges? | ||||||
| sum(STR$a == STR$b)  # none | sum(STR$a == STR$b)  # none | ||||||
|  |  | ||||||
| # log(rank) / log(frequency) | # log(rank) / log(frequency) | ||||||
| myTbl <- table(c(STR$a, STR$b)) | myTbl <- table(c(STR$a, STR$b)) | ||||||
| myTbl <- myTbl[order(myTbl, decreasing = TRUE)] | myTbl <- myTbl[order(myTbl, decreasing = TRUE)] | ||||||
|  |  | ||||||
| hist(myTbl, breaks = 40, col = "#FFEEBB") | hist(myTbl, breaks = 40, col = "#FFEEBB") | ||||||
|  |  | ||||||
| # number of singletons | # number of singletons | ||||||
| sum(myTbl == 1) # almost a quarter | sum(myTbl == 1) # almost a quarter | ||||||
|  |  | ||||||
| # maximum? | # maximum? | ||||||
| myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465 | myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465 | ||||||
|                                    # Google: CDC5L |                                    # Google: CDC5L | ||||||
|  |  | ||||||
| # Zipf-plot | # Zipf-plot | ||||||
| plot(log(1:length(myTbl)), log(as.numeric(myTbl)), | plot(log(1:length(myTbl)), log(as.numeric(myTbl)), | ||||||
|      type = "b", cex = 0.7, |      type = "b", cex = 0.7, | ||||||
|      main = "STRINGedges - degrees", |      main = "STRINGedges - degrees", | ||||||
|      xlab = "log(rank)", |      xlab = "log(rank)", | ||||||
|      ylab = "log(frequency)", |      ylab = "log(frequency)", | ||||||
|      col = "#FFBB88") |      col = "#FFBB88") | ||||||
|  |  | ||||||
| sprintf("Average number of interactions: %5.2f", | sprintf("Average number of interactions: %5.2f", | ||||||
|          nrow(STR) / length(unique(c(STR$a, STR$b)))) |          nrow(STR) / length(unique(c(STR$a, STR$b)))) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    6  Write to file  ======================================================= | # =    6  Write to file  ======================================================= | ||||||
|  |  | ||||||
| saveRDS(STR, file = "./data/STRINGedges.rds") | saveRDS(STR, file = "./data/STRINGedges.rds") | ||||||
|  |  | ||||||
| # STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the | # STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the | ||||||
|                                                     # object when needed |                                                     # object when needed | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,167 +1,167 @@ | |||||||
| # tocID <- "scripts/ABC-makeScCCnet.R" | # tocID <- "scripts/ABC-makeScCCnet.R" | ||||||
| # | # | ||||||
| # Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle" | # Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle" | ||||||
| # GOSlim annotation. | # GOSlim annotation. | ||||||
| # | # | ||||||
| # Boris Steipe for ABC learning units | # Boris Steipe for ABC learning units | ||||||
| # | # | ||||||
| # Notes: | # Notes: | ||||||
| # | # | ||||||
| #      The large source- datafiles are NOT posted to github. If you want to | #      The large source- datafiles are NOT posted to github. If you want to | ||||||
| #      experiment with your own code, download them and place them into your | #      experiment with your own code, download them and place them into your | ||||||
| #      local  ./data  directory. | #      local  ./data  directory. | ||||||
| # | # | ||||||
| #      STRING data source: | #      STRING data source: | ||||||
| #        Download page: | #        Download page: | ||||||
| # https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae | # https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae | ||||||
| #        Data: (20.1 mb) | #        Data: (20.1 mb) | ||||||
| # https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz | # https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz | ||||||
| # | # | ||||||
| #      GOSlim data source: (Note: this has moved from GO to SGD) | #      GOSlim data source: (Note: this has moved from GO to SGD) | ||||||
| #        Info page: https://www.yeastgenome.org/downloads | #        Info page: https://www.yeastgenome.org/downloads | ||||||
| #        Info page: http://sgd-archive.yeastgenome.org/curation/literature/ | #        Info page: http://sgd-archive.yeastgenome.org/curation/literature/ | ||||||
| #        Data: (3 mb) | #        Data: (3 mb) | ||||||
| # http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab | # http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab | ||||||
| # | # | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017-10  -  2020-09 | # Date:     2017-10  -  2020-09 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #           1.2    2020 Update. GO Slim Yeast now at SGD | #           1.2    2020 Update. GO Slim Yeast now at SGD | ||||||
| #           1.1    Change from require() to requireNamespace(), | #           1.1    Change from require() to requireNamespace(), | ||||||
| #                      use <package>::<function>() idiom throughout | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    First code copied from 2016 material. | #           1.0    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
| # SRCDIR <- "./instructor" | # SRCDIR <- "./instructor" | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                           Line | #TOC>   Section  Title                                           Line | ||||||
| #TOC> --------------------------------------------------------------- | #TOC> --------------------------------------------------------------- | ||||||
| #TOC>   1        INITIALIZE                                        58 | #TOC>   1        INITIALIZE                                        58 | ||||||
| #TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66 | #TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66 | ||||||
| #TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96 | #TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96 | ||||||
| #TOC>   3.1        Intersect interactions and annotations         122 | #TOC>   3.1        Intersect interactions and annotations         122 | ||||||
| #TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128 | #TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  INITIALIZE  ========================================================== | # =    1  INITIALIZE  ========================================================== | ||||||
|  |  | ||||||
| SRCDIR <- "./data" | SRCDIR <- "./data" | ||||||
| if (! requireNamespace("readr", quietly = TRUE)) { | if (! requireNamespace("readr", quietly = TRUE)) { | ||||||
|   install.packages("readr") |   install.packages("readr") | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  STRING FUNCTIONAL INTERACTION DATA  ================================== | # =    2  STRING FUNCTIONAL INTERACTION DATA  ================================== | ||||||
|  |  | ||||||
| # Read STRING Data (needs to be downloaded from database, see URL in Notes) | # Read STRING Data (needs to be downloaded from database, see URL in Notes) | ||||||
| # The .gz compressed version is 20MB, the uncompressed version is 110MB - | # The .gz compressed version is 20MB, the uncompressed version is 110MB - | ||||||
| # really not necessary to uncompress since readr:: can read from compressed | # really not necessary to uncompress since readr:: can read from compressed | ||||||
| # files, and does so automatically, based on the file extension. | # files, and does so automatically, based on the file extension. | ||||||
| ( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") ) | ( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") ) | ||||||
| STR <- readr::read_delim(fn, delim = " ") | STR <- readr::read_delim(fn, delim = " ") | ||||||
|  |  | ||||||
| # Subset only IDs and combined_score column | # Subset only IDs and combined_score column | ||||||
| STR <- STR[ , c("protein1", "protein2", "combined_score")] | STR <- STR[ , c("protein1", "protein2", "combined_score")] | ||||||
|  |  | ||||||
| # head(STR) | # head(STR) | ||||||
| # sum(STR$combined_score > 909)  # 100270 edges | # sum(STR$combined_score > 909)  # 100270 edges | ||||||
| # subset for 100,000 highest confidence edges | # subset for 100,000 highest confidence edges | ||||||
| STR <- STR[(STR$combined_score > 909), ] | STR <- STR[(STR$combined_score > 909), ] | ||||||
| head(STR) | head(STR) | ||||||
|  |  | ||||||
| # IDs are formatted like 4932.YAL005C ... drop the "4932." prefix | # IDs are formatted like 4932.YAL005C ... drop the "4932." prefix | ||||||
| STR$protein1 <- gsub("^4932\\.", "", STR$protein1) | STR$protein1 <- gsub("^4932\\.", "", STR$protein1) | ||||||
| STR$protein2 <- gsub("^4932\\.", "", STR$protein2) | STR$protein2 <- gsub("^4932\\.", "", STR$protein2) | ||||||
| head(STR) | head(STR) | ||||||
|  |  | ||||||
| # get a vector of gene names in this list | # get a vector of gene names in this list | ||||||
| myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene | myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene | ||||||
|                                                       # names |                                                       # names | ||||||
| length(myIntxGenes) | length(myIntxGenes) | ||||||
| sample(myIntxGenes, 10)  # choose 10 at random (sanity check) | sample(myIntxGenes, 10)  # choose 10 at random (sanity check) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  GOSlim FUNCTIONAL ANNOTATIONS  ======================================= | # =    3  GOSlim FUNCTIONAL ANNOTATIONS  ======================================= | ||||||
| # | # | ||||||
| # Read GOSlim data  (needs to be downloaded from database, see URL in Notes) | # Read GOSlim data  (needs to be downloaded from database, see URL in Notes) | ||||||
| ( fn <- file.path(SRCDIR, "go_slim_mapping.tab") ) | ( fn <- file.path(SRCDIR, "go_slim_mapping.tab") ) | ||||||
|  |  | ||||||
| Gsl <- readr::read_tsv(fn, | Gsl <- readr::read_tsv(fn, | ||||||
|                        col_names = c("ID", |                        col_names = c("ID", | ||||||
|                                      "name", |                                      "name", | ||||||
|                                      "SGDId", |                                      "SGDId", | ||||||
|                                      "Ontology", |                                      "Ontology", | ||||||
|                                      "termName", |                                      "termName", | ||||||
|                                      "termID", |                                      "termID", | ||||||
|                                      "status")) |                                      "status")) | ||||||
|  |  | ||||||
| head(Gsl) | head(Gsl) | ||||||
|  |  | ||||||
| # What cell cycle names does it contain? | # What cell cycle names does it contain? | ||||||
| myGslTermNames <- unique(Gsl$termName)  # 169 unique terms | myGslTermNames <- unique(Gsl$termName)  # 169 unique terms | ||||||
| myGslTermNames[grep("cycle", myGslTermNames)] | myGslTermNames[grep("cycle", myGslTermNames)] | ||||||
| # [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle" | # [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle" | ||||||
|  |  | ||||||
| # Choose "mitotic cell cycle" as the GOslim term to subset with | # Choose "mitotic cell cycle" as the GOslim term to subset with | ||||||
|  |  | ||||||
| scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"]) | scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"]) | ||||||
| length(scCCgenes)  # 324 genes annotated to that term | length(scCCgenes)  # 324 genes annotated to that term | ||||||
|  |  | ||||||
| # ==   3.1  Intersect interactions and annotations  ============================ | # ==   3.1  Intersect interactions and annotations  ============================ | ||||||
|  |  | ||||||
| sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence | sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence | ||||||
| #                                # functional interactions | #                                # functional interactions | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  DEFINE THE CELL-CYCLE NETWORK  ======================================= | # =    4  DEFINE THE CELL-CYCLE NETWORK  ======================================= | ||||||
| # | # | ||||||
| # Define scCCnet ... the S. cerevisiae Cell Cycle network | # Define scCCnet ... the S. cerevisiae Cell Cycle network | ||||||
| # Subset all rows for which BOTH genes are in the GOslim cell cycle set | # Subset all rows for which BOTH genes are in the GOslim cell cycle set | ||||||
| # | # | ||||||
| scCCnet <- STR[(STR$protein1 %in% scCCgenes) & | scCCnet <- STR[(STR$protein1 %in% scCCgenes) & | ||||||
|                (STR$protein2 %in% scCCgenes), ] |                (STR$protein2 %in% scCCgenes), ] | ||||||
|  |  | ||||||
| # How many genes are there? | # How many genes are there? | ||||||
| length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283 | length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283 | ||||||
|  |  | ||||||
| # Each edge is listed twice - now remove duplicates. | # Each edge is listed twice - now remove duplicates. | ||||||
|  |  | ||||||
| # Step 1: make a vector: sort two names so the first one is alphabetically | # Step 1: make a vector: sort two names so the first one is alphabetically | ||||||
| #         smaller than the second one. This brings the two names into a defined | #         smaller than the second one. This brings the two names into a defined | ||||||
| #         order. Then concatenate them with a "." - the resulting string | #         order. Then concatenate them with a "." - the resulting string | ||||||
| #         is always the same, for any order. E.g. c("A", "B") gives "A.B" | #         is always the same, for any order. E.g. c("A", "B") gives "A.B" | ||||||
| #         and c("B", "A") also gives "A.B". This identifies duplicates. | #         and c("B", "A") also gives "A.B". This identifies duplicates. | ||||||
|  |  | ||||||
| x <- apply(cbind(scCCnet$protein1, scCCnet$protein2), | x <- apply(cbind(scCCnet$protein1, scCCnet$protein2), | ||||||
|            1, |            1, | ||||||
|            FUN = function(x) { return(paste(sort(x), collapse = ".")) }) |            FUN = function(x) { return(paste(sort(x), collapse = ".")) }) | ||||||
| head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc. | head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc. | ||||||
|  |  | ||||||
| sum(duplicated(x))  # 1453 | sum(duplicated(x))  # 1453 | ||||||
|  |  | ||||||
| # Step 2: drop all rows that contain duplicates in x | # Step 2: drop all rows that contain duplicates in x | ||||||
| scCCnet <- scCCnet[! duplicated(x), ] | scCCnet <- scCCnet[! duplicated(x), ] | ||||||
|  |  | ||||||
| # Confirm we didn't lose genes | # Confirm we didn't lose genes | ||||||
| length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change | length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change | ||||||
| nrow(scCCnet) | nrow(scCCnet) | ||||||
| # Network has 283 nodes, 1453 edges | # Network has 283 nodes, 1453 edges | ||||||
|  |  | ||||||
| saveRDS(scCCnet, file = "./data/scCCnet.rds") | saveRDS(scCCnet, file = "./data/scCCnet.rds") | ||||||
|  |  | ||||||
| # scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the | # scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the | ||||||
|                                              #      object when needed |                                              #      object when needed | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,135 +1,135 @@ | |||||||
| # tocID <- "scripts/ABC-writeALN.R" | # tocID <- "scripts/ABC-writeALN.R" | ||||||
| # | # | ||||||
| # ToDo:    calculate consensus line | # ToDo:    calculate consensus line | ||||||
| #          append sequence numbers | #          append sequence numbers | ||||||
| # Notes: | # Notes: | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| writeALN <- function(ali, | writeALN <- function(ali, | ||||||
|                      range, |                      range, | ||||||
|                      note = "", |                      note = "", | ||||||
|                      myCon = stdout(), |                      myCon = stdout(), | ||||||
|                      blockWidth = 60) { |                      blockWidth = 60) { | ||||||
|   # Purpose: |   # Purpose: | ||||||
|   #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or |   #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or | ||||||
|   #     a file in CLUSTAL W format. |   #     a file in CLUSTAL W format. | ||||||
|   # Version: 2.0 |   # Version: 2.0 | ||||||
|   # Date:    2017 10 |   # Date:    2017 10 | ||||||
|   # Author:  Boris Steipe |   # Author:  Boris Steipe | ||||||
|   # |   # | ||||||
|   # Parameters: |   # Parameters: | ||||||
|   #     ali             MsaAAMultipleAlignment or AAStringSet or character |   #     ali             MsaAAMultipleAlignment or AAStringSet or character | ||||||
|   #                       vector. |   #                       vector. | ||||||
|   #     range      num  a two-integer vector of start and end positions if |   #     range      num  a two-integer vector of start and end positions if | ||||||
|   #                       only a range of the MSA should be written, e.g. |   #                       only a range of the MSA should be written, e.g. | ||||||
|   #                       a domain. Defaults to the full alignment length. |   #                       a domain. Defaults to the full alignment length. | ||||||
|   #     note       chr  a character string (length one) that is appended to |   #     note       chr  a character string (length one) that is appended to | ||||||
|   #                       the "CLUSTAL W format. " title line, which is the |   #                       the "CLUSTAL W format. " title line, which is the | ||||||
|   #                       first line of the output. Defaults to an empty |   #                       first line of the output. Defaults to an empty | ||||||
|   #                       string. |   #                       string. | ||||||
|   #     myCon           a connection (cf. the con argument for writeLines). |   #     myCon           a connection (cf. the con argument for writeLines). | ||||||
|   #                       Defaults to stdout() |   #                       Defaults to stdout() | ||||||
|   #     blockWidth int  width of sequence block. Default 60 characters. |   #     blockWidth int  width of sequence block. Default 60 characters. | ||||||
|   # Value: |   # Value: | ||||||
|   #     NA   the function is invoked for its side effect of printing an |   #     NA   the function is invoked for its side effect of printing an | ||||||
|   #          alignment to stdout() or file. |   #          alignment to stdout() or file. | ||||||
|  |  | ||||||
|   blockWidth <- as.integer(blockWidth) |   blockWidth <- as.integer(blockWidth) | ||||||
|   if (is.na(blockWidth)) { |   if (is.na(blockWidth)) { | ||||||
|     stop("PANIC: parameter \"blockWidth\" must be numeric.") |     stop("PANIC: parameter \"blockWidth\" must be numeric.") | ||||||
|   } |   } | ||||||
|   if (blockWidth < 1) { |   if (blockWidth < 1) { | ||||||
|     stop("PANIC: parameter \"blockWidth\" must be greater than zero.") |     stop("PANIC: parameter \"blockWidth\" must be greater than zero.") | ||||||
|   } |   } | ||||||
|   if (blockWidth > 60) { |   if (blockWidth > 60) { | ||||||
|     warning("Programs that read CLUSTAL format might not expect blockWidth > 60.") |     warning("Programs that read CLUSTAL format might not expect blockWidth > 60.") | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   # Extract the raw data from the objects depending on their respective class |   # Extract the raw data from the objects depending on their respective class | ||||||
|   # and put it into a named vector of strings. |   # and put it into a named vector of strings. | ||||||
|  |  | ||||||
|   # Extract XStringSet from MsaXMultipleAlignment ... |   # Extract XStringSet from MsaXMultipleAlignment ... | ||||||
|   if (class(ali) == "MsaAAMultipleAlignment" | |   if (class(ali) == "MsaAAMultipleAlignment" | | ||||||
|       class(ali) == "MsaDNAMultipleAlignment" | |       class(ali) == "MsaDNAMultipleAlignment" | | ||||||
|       class(ali) == "MsaRNAMultipleAlignment") { |       class(ali) == "MsaRNAMultipleAlignment") { | ||||||
|       ali <- ali@unmasked |       ali <- ali@unmasked | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   # Process XStringSet |   # Process XStringSet | ||||||
|   if (class(ali) == "AAStringSet" | |   if (class(ali) == "AAStringSet" | | ||||||
|       class(ali) == "DNAStringSet" | |       class(ali) == "DNAStringSet" | | ||||||
|       class(ali) == "RNAStringSet") { |       class(ali) == "RNAStringSet") { | ||||||
|     sSet <- as.character(ali) # we use as.character(), not toString() thus |     sSet <- as.character(ali) # we use as.character(), not toString() thus | ||||||
|                               # we don't _have_ to load Biostrings |                               # we don't _have_ to load Biostrings | ||||||
|   } else if (class(ali) == "character") { |   } else if (class(ali) == "character") { | ||||||
|     sSet <- ali |     sSet <- ali | ||||||
|   } else { |   } else { | ||||||
|     stop(paste("Input object of class", |     stop(paste("Input object of class", | ||||||
|                class(ali), |                class(ali), | ||||||
|                "can't be handled by this function.")) |                "can't be handled by this function.")) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (missing(range)) { |   if (missing(range)) { | ||||||
|     range <- 1 |     range <- 1 | ||||||
|     range[2] <- max(nchar(sSet)) |     range[2] <- max(nchar(sSet)) | ||||||
|   } else { |   } else { | ||||||
|     range <- as.integer(range) |     range <- as.integer(range) | ||||||
|     if(length(range) != 2 || |     if(length(range) != 2 || | ||||||
|        any(is.na(range)) || |        any(is.na(range)) || | ||||||
|        range[1] > range[2] || |        range[1] > range[2] || | ||||||
|        range[1] < 1) { |        range[1] < 1) { | ||||||
|       stop("PANIC: \"range\" parameter must contain valid start and end index.") |       stop("PANIC: \"range\" parameter must contain valid start and end index.") | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   # Right-pad any sequence with "-" that is shorter than range[2] |   # Right-pad any sequence with "-" that is shorter than range[2] | ||||||
|     for (i in seq_along(sSet)) { |     for (i in seq_along(sSet)) { | ||||||
|       if (nchar(sSet[i]) < range[2]) { |       if (nchar(sSet[i]) < range[2]) { | ||||||
|         sSet[i] <- paste0(sSet[i], |         sSet[i] <- paste0(sSet[i], | ||||||
|                           paste0(rep("-", range[2] - nchar(sSet[i])), |                           paste0(rep("-", range[2] - nchar(sSet[i])), | ||||||
|                                  collapse = "")) |                                  collapse = "")) | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   # Right-pad sequence names |   # Right-pad sequence names | ||||||
|   sNames <- names(sSet) |   sNames <- names(sSet) | ||||||
|   len <- max(nchar(sNames)) + 2 # longest name plus two spaces |   len <- max(nchar(sNames)) + 2 # longest name plus two spaces | ||||||
|   for (i in seq_along(sNames)) { |   for (i in seq_along(sNames)) { | ||||||
|     sNames[i] <- paste0(sNames[i], |     sNames[i] <- paste0(sNames[i], | ||||||
|                       paste0(rep(" ", len - nchar(sNames[i])), |                       paste0(rep(" ", len - nchar(sNames[i])), | ||||||
|                              collapse = "")) |                              collapse = "")) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   # Process each sequence |   # Process each sequence | ||||||
|   txt <- paste0("CLUSTAL W format. ", note) |   txt <- paste0("CLUSTAL W format. ", note) | ||||||
|   txt[2] <- "" |   txt[2] <- "" | ||||||
|  |  | ||||||
|   iStarts <- seq(range[1], range[2], by = blockWidth) |   iStarts <- seq(range[1], range[2], by = blockWidth) | ||||||
|   iEnds <- c((iStarts[-1] - 1), range[2]) |   iEnds <- c((iStarts[-1] - 1), range[2]) | ||||||
|  |  | ||||||
|   for (i in seq_along(iStarts)) { |   for (i in seq_along(iStarts)) { | ||||||
|     for (j in seq_along(sSet)) { |     for (j in seq_along(sSet)) { | ||||||
|       txt <- c(txt, |       txt <- c(txt, | ||||||
|                paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i]))) |                paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i]))) | ||||||
|     } |     } | ||||||
|     txt <- c(txt, "")  # append a blank consensus line |     txt <- c(txt, "")  # append a blank consensus line | ||||||
|     txt <- c(txt, "")  # append a separator line |     txt <- c(txt, "")  # append a separator line | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   writeLines(txt, con= myCon) |   writeLines(txt, con= myCon) | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| # ====  TESTS  ================================================================= | # ====  TESTS  ================================================================= | ||||||
| # Enter your function tests here... | # Enter your function tests here... | ||||||
|  |  | ||||||
| if (FALSE) { | if (FALSE) { | ||||||
|   # test ... |   # test ... | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -1,121 +1,121 @@ | |||||||
| # ABC-writeMFA.R | # ABC-writeMFA.R | ||||||
| # | # | ||||||
| # ToDo: | # ToDo: | ||||||
| # Notes:  2.1  bugfix: empty notes caused superfluous blank after header. | # Notes:  2.1  bugfix: empty notes caused superfluous blank after header. | ||||||
| # | # | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| writeMFA <- function(ali, | writeMFA <- function(ali, | ||||||
|                      range, |                      range, | ||||||
|                      note = "", |                      note = "", | ||||||
|                      myCon = stdout(), |                      myCon = stdout(), | ||||||
|                      blockWidth = 80) { |                      blockWidth = 80) { | ||||||
|   # Purpose: |   # Purpose: | ||||||
|   #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or |   #     Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or | ||||||
|   #     a file in multi-FASTA format. |   #     a file in multi-FASTA format. | ||||||
|   # Version: 2.1 |   # Version: 2.1 | ||||||
|   # Date:    2017  10 |   # Date:    2017  10 | ||||||
|   # Author:  Boris Steipe |   # Author:  Boris Steipe | ||||||
|   # |   # | ||||||
|   # Parameters: |   # Parameters: | ||||||
|   #     ali             MsaAAMultipleAlignment or AAStringSet or character |   #     ali             MsaAAMultipleAlignment or AAStringSet or character | ||||||
|   #                       vector |   #                       vector | ||||||
|   #     range      num  a two-integer vector of start and end positions if |   #     range      num  a two-integer vector of start and end positions if | ||||||
|   #                       only a range of the MSA should be written, e.g. |   #                       only a range of the MSA should be written, e.g. | ||||||
|   #                       a domain. Defaults to the full sequence length. |   #                       a domain. Defaults to the full sequence length. | ||||||
|   #     note       chr  a vector of character that is appended to the name |   #     note       chr  a vector of character that is appended to the name | ||||||
|   #                       of a sequence in the FASTA header. Recycling of |   #                       of a sequence in the FASTA header. Recycling of | ||||||
|   #                       shorter vectors applies, thus a vector of length one |   #                       shorter vectors applies, thus a vector of length one | ||||||
|   #                       is added to all headers. |   #                       is added to all headers. | ||||||
|   #     myCon           a connection (cf. the con argument for writeLines). |   #     myCon           a connection (cf. the con argument for writeLines). | ||||||
|   #                       Defaults to stdout() |   #                       Defaults to stdout() | ||||||
|   #     blockWidth int  width of sequence block. Default 80 characters. |   #     blockWidth int  width of sequence block. Default 80 characters. | ||||||
|   # Value: |   # Value: | ||||||
|   #     NA   the function is invoked for its side effect of printing an |   #     NA   the function is invoked for its side effect of printing an | ||||||
|   #          alignment to stdout() or file. |   #          alignment to stdout() or file. | ||||||
|  |  | ||||||
|   blockWidth <- as.integer(blockWidth) |   blockWidth <- as.integer(blockWidth) | ||||||
|   if (is.na(blockWidth)) { |   if (is.na(blockWidth)) { | ||||||
|     stop("PANIC: parameter \"blockWidth\" must be numeric.") |     stop("PANIC: parameter \"blockWidth\" must be numeric.") | ||||||
|   } |   } | ||||||
|   if (! blockWidth > 0){ |   if (! blockWidth > 0){ | ||||||
|     stop("PANIC: parameter \"blockWidth\" must be greater than zero.") |     stop("PANIC: parameter \"blockWidth\" must be greater than zero.") | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   # Extract the raw data from the objects depending on their respective class |   # Extract the raw data from the objects depending on their respective class | ||||||
|   # and put it into a named vector of strings. |   # and put it into a named vector of strings. | ||||||
|  |  | ||||||
|   # Extract XStringSet from MsaXMultipleAlignment ... |   # Extract XStringSet from MsaXMultipleAlignment ... | ||||||
|   if (class(ali) == "MsaAAMultipleAlignment" | |   if (class(ali) == "MsaAAMultipleAlignment" | | ||||||
|       class(ali) == "MsaDNAMultipleAlignment" | |       class(ali) == "MsaDNAMultipleAlignment" | | ||||||
|       class(ali) == "MsaRNAMultipleAlignment") { |       class(ali) == "MsaRNAMultipleAlignment") { | ||||||
|       ali <- ali@unmasked |       ali <- ali@unmasked | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   # Process XStringSet |   # Process XStringSet | ||||||
|   if (class(ali) == "AAStringSet" | |   if (class(ali) == "AAStringSet" | | ||||||
|       class(ali) == "DNAStringSet" | |       class(ali) == "DNAStringSet" | | ||||||
|       class(ali) == "RNAStringSet") { |       class(ali) == "RNAStringSet") { | ||||||
|     sSet <- as.character(ali) # we use as.character(), not toString() thus |     sSet <- as.character(ali) # we use as.character(), not toString() thus | ||||||
|                               # we don't _have_ to load Biostrings |                               # we don't _have_ to load Biostrings | ||||||
|   } else if (class(ali) == "character") { |   } else if (class(ali) == "character") { | ||||||
|     sSet <- ali |     sSet <- ali | ||||||
|   } else { |   } else { | ||||||
|     stop(paste("Input object of class", |     stop(paste("Input object of class", | ||||||
|                class(ali), |                class(ali), | ||||||
|                "can't be handled by this function.")) |                "can't be handled by this function.")) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (missing(range)) { |   if (missing(range)) { | ||||||
|     range <- 1 |     range <- 1 | ||||||
|     range[2] <- max(nchar(sSet)) |     range[2] <- max(nchar(sSet)) | ||||||
|   } else { |   } else { | ||||||
|     range <- as.integer(range) |     range <- as.integer(range) | ||||||
|     if(length(range) != 2 || |     if(length(range) != 2 || | ||||||
|        any(is.na(range)) || |        any(is.na(range)) || | ||||||
|        range[1] > range[2] || |        range[1] > range[2] || | ||||||
|        range[1] < 1) { |        range[1] < 1) { | ||||||
|       stop("PANIC: \"range\" parameter must contain valid start and end index.") |       stop("PANIC: \"range\" parameter must contain valid start and end index.") | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   # Process each sequence |   # Process each sequence | ||||||
|   txt <- character() |   txt <- character() | ||||||
|   if (note != "") {  # construct header line |   if (note != "") {  # construct header line | ||||||
|     headers <- paste(names(sSet), note) |     headers <- paste(names(sSet), note) | ||||||
|   } else { |   } else { | ||||||
|     headers <- names(sSet) |     headers <- names(sSet) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   for (i in seq_along(sSet)) { |   for (i in seq_along(sSet)) { | ||||||
|  |  | ||||||
|     # output FASTA header |     # output FASTA header | ||||||
|     txt <- c(txt, sprintf(">%s", headers[i])) |     txt <- c(txt, sprintf(">%s", headers[i])) | ||||||
|  |  | ||||||
|     # output the sequence in blocks of blockWidth per line ... |     # output the sequence in blocks of blockWidth per line ... | ||||||
|     iStarts <- seq(range[1], range[2], by = blockWidth) |     iStarts <- seq(range[1], range[2], by = blockWidth) | ||||||
|     iEnds <- c((iStarts[-1] - 1), range[2]) |     iEnds <- c((iStarts[-1] - 1), range[2]) | ||||||
|  |  | ||||||
|     thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks |     thisSeq <- substring(sSet[i], iStarts, iEnds)  # collect all blocks | ||||||
|     thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks |     thisSeq <- thisSeq[! nchar(thisSeq) == 0]      # drop empty blocks | ||||||
|     txt <- c(txt, thisSeq) |     txt <- c(txt, thisSeq) | ||||||
|  |  | ||||||
|     txt <- c(txt, "")  # append an empty line for readability |     txt <- c(txt, "")  # append an empty line for readability | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   writeLines(txt, con = myCon) |   writeLines(txt, con = myCon) | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| # ====  TESTS  ================================================================= | # ====  TESTS  ================================================================= | ||||||
| # Enter your function tests here... | # Enter your function tests here... | ||||||
|  |  | ||||||
| if (FALSE) { | if (FALSE) { | ||||||
|   # test ... |   # test ... | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										768
									
								
								scripts/BLAST.R
									
									
									
									
									
								
							
							
						
						
									
										768
									
								
								scripts/BLAST.R
									
									
									
									
									
								
							| @@ -1,384 +1,384 @@ | |||||||
| # BLAST.R | # BLAST.R | ||||||
| # | # | ||||||
| # Purpose: Send off one BLAST search and return parsed list of results | # Purpose: Send off one BLAST search and return parsed list of results | ||||||
| #          This script uses the BLAST URL-API | #          This script uses the BLAST URL-API | ||||||
| #          (Application Programming Interface) at the NCBI. | #          (Application Programming Interface) at the NCBI. | ||||||
| #          Read about the constraints here: | #          Read about the constraints here: | ||||||
| #          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo | #          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo | ||||||
| # | # | ||||||
| # | # | ||||||
| # Version: 3.2 | # Version: 3.2 | ||||||
| # Date:    2016 09 - 2020 09 | # Date:    2016 09 - 2020 09 | ||||||
| # Author:  Boris Steipe | # Author:  Boris Steipe | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
| #    3.2   2020 updates | #    3.2   2020 updates | ||||||
| #    3.1   Change from require() to requireNamespace(), | #    3.1   Change from require() to requireNamespace(), | ||||||
| #          use <package>::<function>() idiom throughout | #          use <package>::<function>() idiom throughout | ||||||
| #    3.0   parsing logic had not been fully implemented; Fixed. | #    3.0   parsing logic had not been fully implemented; Fixed. | ||||||
| #    2.1   bugfix in BLAST(), bug was blanking non-split deflines; | #    2.1   bugfix in BLAST(), bug was blanking non-split deflines; | ||||||
| #          refactored parseBLASTalignment() to handle lists with multiple hits. | #          refactored parseBLASTalignment() to handle lists with multiple hits. | ||||||
| #    2.0   Completely rewritten because the interface completely changed. | #    2.0   Completely rewritten because the interface completely changed. | ||||||
| #          Code adapted in part from NCBI Perl sample code: | #          Code adapted in part from NCBI Perl sample code: | ||||||
| #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $ | #          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $ | ||||||
| #    1.0   first version posted for BCH441 2016, based on BLAST - API | #    1.0   first version posted for BCH441 2016, based on BLAST - API | ||||||
| # | # | ||||||
| # ToDo:    Return the organism/strain name in the output, and propagate | # ToDo:    Return the organism/strain name in the output, and propagate | ||||||
| #          into MYSPE selection script. | #          into MYSPE selection script. | ||||||
| # | # | ||||||
| # Notes:   This is somewhat pedestrian, but apparently there are currently | # Notes:   This is somewhat pedestrian, but apparently there are currently | ||||||
| #          no R packages that contain such code. | #          no R packages that contain such code. | ||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! requireNamespace("httr", quietly = TRUE)) { | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
  # Purpose:
  #     Basic BLAST (blastp) search against the NCBI BLAST server: submit a
  #     query (or pick up an earlier one via its request ID), poll until the
  #     search has finished, then fetch the text report and parse each
  #     alignment block with parseBLASTalignment().
  #
  # Parameters:
  #     Q: query - either a valid ID or a sequence. Only evaluated when a new
  #        search is submitted (i.e. when rid is "").
  #     db: "refseq_protein" by default,
  #         other legal values include: "nr", "pdb", "swissprot" ...
  #     nHits: number of hits to maximally return
  #     E: E-value cutoff. Do not return hits whose score would be expected
  #        to occur E or more times in a database of random sequence.
  #     limits: a valid ENTREZ filter, e.g. "txid4751[ORGN]". It is
  #             URL-encoded before it is sent.
  #     rid: a request ID - to retrieve earlier search results
  #     query: the actual query string (needed when retrieving results
  #            with an rid)
  #     quietly: if TRUE, wait silently with Sys.sleep(); if FALSE, print
  #              status messages and wait via waitTimer()
  #     myTimeout: how much longer _after_ rtoe to wait for a result
  #                before giving up (seconds)
  # Value:
  #     result: list with $query (the request URL, or the query argument
  #             when an rid was supplied), $rid, $rtoe, and - if the search
  #             finished and produced a report - $hits, a list with one
  #             parsed alignment per hit. If the search failed, expired,
  #             timed out or found no hits, the list is returned without
  #             $hits; if the report contains no parseable alignment,
  #             $hits is an empty list.
  # Errors:
  #     stop()s if the server answers with a non-success HTTP status, or if
  #     the response to a new submission contains no request ID.

  BLASTURL  <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
  EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done

  # GET a URL and return the response body as text; stop() on HTTP errors.
  # "what" only customizes the error message.
  fetchText <- function(url, what) {
    response <- httr::GET(url)
    if (httr::http_status(response)$category != "Success") {
      stop(sprintf("PANIC: Can't %s. BLAST server status error: %s",
                   what,
                   httr::http_status(response)$message))
    }
    return(httr::content(response, "text", encoding = "UTF-8"))
  }

  # TRUE if string s appears literally anywhere in the response text
  contains <- function(s, txt) {
    return(any(grepl(s, txt, fixed = TRUE)))
  }

  results <- list()
  results$query <- query
  results$rid   <- rid
  results$rtoe  <- 0

  if (rid == "") {  # If no rid is available, spawn a search.
                    # Else, proceed directly to retrieval.

    # prepare query, GET(), and parse rid and rtoe from BLAST server response
    results$query <- paste0(BLASTURL,
                            "?",
                            "CMD=Put",
                            "&PROGRAM=", "blastp",
                            "&QUERY=", URLencode(Q),
                            "&DATABASE=", db,
                            "&MATRIX=", "BLOSUM62",
                            "&EXPECT=", as.character(E),
                            "&HITLIST_SIZE=", as.character(nHits),
                            "&ALIGNMENTS=", as.character(nHits),
                            "&FORMAT_TYPE=Text")

    if (limits != "") {
      # Entrez filters contain reserved characters ("[", "]", blanks ...)
      # that must not appear raw in a URL. With the default repeated = FALSE
      # a filter that is already percent-encoded is passed through unchanged.
      results$query <- paste0(results$query,
                              "&ENTREZ_QUERY=",
                              URLencode(limits, reserved = TRUE))
    }

    # send it off ...
    txt <- fetchText(results$query, "send query")

    patt <- "RID = (\\w+)" # match the request id
    results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]

    patt <- "RTOE = (\\d+)" # match the expected completion time
    results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])

    # Without a request ID there is nothing to poll for: fail loudly
    # instead of querying the server for "RID=NA".
    if (is.na(results$rid)) {
      stop("PANIC: BLAST server response did not contain a request ID.")
    }

    # A missing time estimate is not fatal: start polling right away.
    if (is.na(results$rtoe)) {
      results$rtoe <- 0
    }

    # Now we wait ...
    if (quietly) {
      Sys.sleep(results$rtoe)
    } else {
      cat(sprintf("BLAST is processing %s:\n", results$rid))
      waitTimer(results$rtoe)
    }

  } # done sending query and retrieving rid, rtoe

  # Poll the server until the search has concluded or the timeout is used up
  checkStatus <- paste0(BLASTURL,
                        "?",
                        "CMD=Get",
                        "&RID=", results$rid,
                        "&FORMAT_TYPE=Text",
                        "&FORMAT_OBJECT=SearchInfo")

  repeat {
    # Check whether the result is ready
    txt <- fetchText(checkStatus, "check status")

    if (! contains("Status=WAITING", txt)) {

      if (contains("Status=FAILED", txt)) {
        cat("BLAST search returned status \"FAILED\". Aborting.\n")
        return(results)
      }

      if (contains("Status=UNKNOWN", txt)) {
        cat("BLAST search returned status \"UNKNOWN\".\n")
        cat("This probably means the rid has expired. Aborting.\n")
        return(results)
      }

      if (contains("Status=READY", txt)) {  # Done
        if (! contains("ThereAreHits=yes", txt)) {  # No hits
          cat("BLAST search ready but no hits found. Aborting.\n")
          return(results)
        }
        break  # done ... retrieve search result
      }
    }

    # Still waiting - or the response contained no status we recognize. The
    # latter is treated like "waiting", so that the loop is always paced and
    # always bounded by the timeout rather than spinning indefinitely.
    myTimeout <- myTimeout - EXTRAWAIT

    if (myTimeout <= 0) { # abort
      cat("BLAST search not concluded before timeout. Aborting.\n")
      cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
                  "Trying checking back later with >",
                  results$rid))
      return(results)
    }

    if (quietly) {
      Sys.sleep(EXTRAWAIT)
    } else {
      # as.integer(): "%d" would throw an error for a non-integer timeout
      cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
                  as.integer(EXTRAWAIT),
                  as.integer(ceiling(myTimeout))))
      waitTimer(EXTRAWAIT)
    }
  } # end result-check loop

  # retrieve results from BLAST server
  retrieve <- paste0(BLASTURL,
                     "?",
                     "CMD=Get",
                     "&RID=", results$rid,
                     "&FORMAT_TYPE=Text")

  txt <- fetchText(retrieve, "retrieve")

  # txt contains the whole set of results. Process:

  # First, we strsplit() on linebreaks:
  txt <- unlist(strsplit(txt, "\n"))

  results$hits <- list()

  # The alignments range from the first line that begins with ">" ...
  iDef <- grep("^>", txt)

  # ... to the last line that begins with "Sbjct"
  iSbj <- grep("^Sbjct", txt)

  if (length(iDef) == 0 || length(iSbj) == 0 ||
      iDef[1] > iSbj[length(iSbj)]) {
    # Nothing to parse: return an empty hit list rather than failing on
    # NA indices.
    cat("BLAST result contains no parseable alignments.\n")
    return(results)
  }

  # Get the alignments block
  txt <- txt[iDef[1]:iSbj[length(iSbj)]]

  # Drop empty lines
  txt <- txt[nchar(txt) > 0]

  # A line that ends "]" but does not begin ">" seems to be a split
  # defline ... eg.
  #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
  #  [2] "EXF-2481]"
  #  Merge these lines to the preceding lines and delete them.
  #
  x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
  if (length(x) > 0) {
    txt[x - 1] <- paste0(txt[x - 1], txt[x])
    txt <- txt[-x]
  }

  # Special case: there may be multiple deflines when the BLAST hit is to
  # redundant, identical sequences. Keep only the first instance of each
  # run of consecutive deflines.
  iKeep <- ! grepl("^>", txt)
  runs <- rle(iKeep)
  runs$positions <- cumsum(runs$lengths)  # index of the last line of a run
  i <- which(runs$lengths > 1 & runs$values == FALSE)
  if (length(i) > 0) {
    firsts <- runs$positions[i] - runs$lengths[i] + 1
    iKeep[firsts] <- TRUE
    txt <- txt[iKeep]
  }

  # After this preprocessing the following should be true:
  # - Every alignment block begins with a defline in which the
  #   first character is ">"
  # - There is only one defline in each block.
  # - Lines are not split.

  # First and last indices of the alignment blocks
  iFirst <- grep("^>", txt)
  iLast  <- c((iFirst[-1] - 1), length(txt))

  # Build the hits list by parsing the blocks
  results$hits <- lapply(seq_along(iFirst),
                         function(i) {
                           parseBLASTalignment(txt[iFirst[i]:iLast[i]])
                         })

  return(results)
}
|  |  | ||||||
parseBLASTalignment <- function(hit) {
  # Parse data from a character vector containing a BLAST hit
  # Parameters:
  #    hit  char   one BLAST hit as char vector: the defline (starting with
  #                ">") first, then the statistics and alignment lines of
  #                the text report, without empty lines.
  # Value:
  #          list   $def          chr   defline
  #                 $accession    chr   accession number
  #                 $organism     chr   complete organism definition (the
  #                                     text in square brackets), NA if the
  #                                     defline contains none
  #                 $species      chr   binomial species, NA if there is no
  #                                     organism
  #                 $E            num   E value
  #                 $lengthAli    num   value of the "Length=" line of the hit
  #                 $nIdentities  num   number of identities
  #                 $nGaps        num   number of gaps
  #                 $Qbounds      num   2-element vector of query start-end
  #                 $Sbounds      num   2-element vector of subject start-end
  #                 $Qseq         chr   query sequence
  #                 $midSeq       chr   midline string
  #                 $Sseq         chr   subject sequence
  # Note:
  #    If the hit contains more than one HSP (more than one " Score = "
  #    line), only the first one is parsed, so that statistics, bounds and
  #    sequences all describe the same alignment.

  getToken <- function(patt, v) {
    # get the first token identified by pattern patt in character vector v;
    # NA if no element of v matches
    v <- v[grep(patt, v)]
    if (length(v) == 0) {
      return(NA)
    }
    return(regmatches(v[1], regexec(patt, v[1]))[[1]][2])
  }

  # Restrict the hit to its first HSP: everything from the second "Score ="
  # line onward belongs to further, lower-scoring alignments.
  iScore <- grep("^\\s*Score\\s*=", hit)
  if (length(iScore) > 1) {
    hit <- hit[seq_len(iScore[2] - 1)]
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
  h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism
  patt <- "\\[(.+)]"
  h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species: the first two words of the organism. A missing organism must
  # give NA - pasting the NA would produce the bogus species "NA sp.".
  if (is.na(h$organism)) {
    h$species <- NA_character_
  } else {
    x <- unlist(strsplit(trimws(h$organism), "\\s+"))
    if (length(x) >= 2) {
      h$species <- paste(x[1], x[2])
    } else if (length(x) == 1) {
      h$species <- paste(x[1], "sp.")
    } else {
      h$species <- NA_character_
    }
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # value of the "Length=" line
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: each "Query" line is followed by its
  # midline and its "Sbjct" line
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences: the columns occupied by the aligned string on the
  # "Query" line are cut from all three lines of the triplet
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)" # capture aligned string
  for (i in seq_along(Que)) {
    m <- regexec(patt, Que[i])
    iFirst <- m[[1]][2]
    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}
|  |  | ||||||
|  |  | ||||||
| # ==== TESTS =================================================================== | # ==== TESTS =================================================================== | ||||||
|  |  | ||||||
if (FALSE) {
  # Usage example - never executed when the file is sourced; run the lines
  # by hand. The search is sent to the NCBI BLAST server.

  # The query can be a sequence ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")
  # ... or an identifier:
  q <- "NP_010227" # refseq ID

  # Search with the query, restricted to fungi by an Entrez filter
  test <- BLAST(q,
                nHits = 100,
                E = 0.001,
                rid = "",
                limits = "txid4751[ORGN]")  # Fungi

  # Inspect the structure of the result and count the hits
  str(test)
  length(test$hits)
}
|  |  | ||||||
| # [END] | # [END] | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,32 +1,32 @@ | |||||||
| # test_biCode.R | # test_biCode.R | ||||||
| # | # | ||||||
|  |  | ||||||
# Unit tests (testthat) for biCode(), called here with organism names

context("biCode() utility function tests")

# Well-formed names: a single binomial, a bracketed name with an extra
# word, and a vector of names
test_that("expected input is processed correctly", {
  expect_equal(biCode("homo sapiens"), "HOMSA")
  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")

  marsupials <- c("Phascolarctos cinereus", "Macropus rufus")
  expect_equal(biCode(marsupials), c("PHACI", "MACRU"))
})

# Empty, blank, numeric or too-short input is padded with "."
test_that("unexpected input is managed", {
  expect_equal(biCode(""), ".....")
  expect_equal(biCode(" "), ".....")
  expect_equal(biCode("123 12"), ".....")
  expect_equal(biCode("h sapiens"), "H..SA")
})

# NA stays NA, alone or within a vector
test_that("NA values are preserved", {
  expect_true(is.na(biCode(NA)))

  withMissing <- c("first", NA, "last")
  expect_equal(biCode(withMissing), c("FIRST", NA, "LAST."))
})

# Calling without an argument fails with R's missing-argument error
test_that("Missing argument throws an error", {
  expect_error(biCode(), "argument \"s\" is missing, with no default")
})
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user