Maintenance

2020-09-26 16:45:29 +10:00 · 2020-09-26 16:45:29 +10:00 · 12725799e1
commit 12725799e1
parent 16513dc488
1 changed files with 19 additions and 24 deletions
--- a/BIN-PHYLO-Data_preparation.R
+++ b/BIN-PHYLO-Data_preparation.R
@ -1,20 +1,15 @@
 # tocID <- "BIN-PHYLO-Data_preparation.R"
 #
-# ---------------------------------------------------------------------------- #
-#  PATIENCE  ...                                                               #
-#    Do not yet work wih this code. Updates in progress. Thank you.            #
-#    boris.steipe@utoronto.ca                                                  #
-# ---------------------------------------------------------------------------- #
-#
 # Purpose:  A Bioinformatics Course:
 #              R code accompanying the BIN-PHYLO-Data_preparation unit.
 #
-# Version:  1.1
+# Version:  1.2
 #
-# Date:     2017  10  -  2019  01
+# Date:     2017-10  -  2020-09
 # Author:   Boris Steipe (boris.steipe@utoronto.ca)
 #
 # Versions:
+#           1.2    2020 Maintenance
 #           1.1    Change from require() to requireNamespace(),
 #                      use <package>::<function>() idiom throughout,
 #                      use BiocManager:: not biocLite()
@ -35,15 +30,15 @@


 #TOC> ==========================================================================
-#TOC>
+#TOC> 
 #TOC>   Section  Title                                     Line
 #TOC> ---------------------------------------------------------
-#TOC>   1        Preparations                                44
-#TOC>   2        Fetching sequences                          76
-#TOC>   3        Multiple Sequence Alignment                117
-#TOC>   4        Reviewing and Editing Alignments           136
-#TOC>   4.1        Masking workflow                         152
-#TOC>
+#TOC>   1        Preparations                                45
+#TOC>   2        Fetching sequences                          77
+#TOC>   3        Multiple Sequence Alignment                118
+#TOC>   4        Reviewing and Editing Alignments           137
+#TOC>   4.1        Masking workflow                         153
+#TOC> 
 #TOC> ==========================================================================


@ -54,7 +49,7 @@
 # been made to the reference files. If you have worked with the prerequisite
 # units, you should have a script named "makeProteinDB.R" that will create the
 # myDB object with a protein and feature database. Ask for advice if not.
-source("makeProteinDB.R")
+source("myScripts/makeProteinDB.R")

 # Load packages we need

@ -172,16 +167,16 @@ for (i in 1:nrow(APSESMsa)) {
 }

 # inspect the result
-msaMatrix[1:7, 1:14]
+msaMatrix[1:7, 30:40]

 # Now let's make a logical vector with an element for each column that selects
 # which columns should be masked out.

 # The number of hyphens in a column is easy to count. Consider:

-    msaMatrix[ , 20]
-    msaMatrix[ , 20] == "-"
-sum(msaMatrix[ , 20] == "-")
+    msaMatrix[ , 20]             # column 20
+    msaMatrix[ , 20] == "-"      # TRUE for all gap characters
+sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE

 # Thus filling our logical vector is simple:

@ -192,7 +187,7 @@ colMask <- logical(ncol(msaMatrix))
 limit <- round(nrow(APSESMsa) * (2/3))

 # iterate over all columns, and write TRUE if there are no more than "limit"
-# hyphens, FALSE if there are more - i.e. TRUE columns will be used fr analysis
+# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
 # and FALSE columns will be rejected.
 for (i in 1:ncol(msaMatrix)) {
  count <- sum(msaMatrix[ , i] == "-")
@ -230,9 +225,9 @@ writeALN(APSESphyloSet)
 # several indels from the KILA_ESCCO outgroup sequence.


-# We save the aligned, masked domains to a file in multi-FASTA format.
-writeMFA(APSESphyloSet, myCon = "APSESphyloSet.mfa")
-
+# We save the aligned, masked domains to a file in the data/ directory,
+# in multi-FASTA format.
+writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")