New unit - regex practice and sample file

2017-10-02 14:51:11 -04:00 · 2017-10-02 14:51:11 -04:00 · 38e5822ac7
commit 38e5822ac7
parent 64dfed2daf
2 changed files with 149 additions and 29 deletions
--- a/RPR-RegEx.R
+++ b/RPR-RegEx.R
@ -16,51 +16,161 @@
 # == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 #
 # DO NOT SIMPLY  source()  THESE FILES!
-
+#
 # If there are portions you don't understand, use R's help system, Google for an
 # answer, or ask your instructor. Don't continue if you don't understand what's
 #  going on. That's not how it works ...
 #
 # ==============================================================================

-# ==============================================================================
-#        PART ONE: REGEX EXAMPLE
-# ==============================================================================
+#TOC> ==========================================================================
+#TOC>
+#TOC>   Section  Title                          Line
+#TOC> ----------------------------------------------
+#TOC>   1        A regex example                  44
+#TOC>   2        Counting lines                  111
+#TOC>   2.1      Counting C-alpha atoms only     128
+#TOC>   3        Code Solutions                  144
+#TOC>   3.1      Counting atoms                  146
+#TOC>   3.2      Counting C-alpha records        160
+#TOC>
+#TOC> ==========================================================================
+#TOC>
+#TOC>
+#TOC>
+#TOC>

-# The Mbp1 sequence as copied from the NCBI Website
-mySeq <- "
-1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk
-61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha
-121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr
-181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq
-241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss
-301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy
-361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts
-421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp
-481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt
-541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp
-601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk
-661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr
-721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak
-781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha
-//
-"

-mySeq     # "\n" means: line-break
+# =    1  A regex example  =====================================================

-mySeq <- gsub("[^a-zA-Z]", "", mySeq) # replace all non-letters with ""
+# The canonical FASTA version of yeast Mbp1 at Uniprot
+s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
+MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
+ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
+SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
+KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
+QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
+PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
+FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
+IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
+SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
+ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
+VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
+IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
+QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
+IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"

-mySeq
+nchar(s)
+# Must be 969

-# Now to change the sequence to upper-case. R has toupper()
-# and tolower().
+# Fetch the Uniprot ID by retrieving the first string that appears between two
+# vertical bars in the header line.
+#

-toupper(mySeq)
+# Develop the regular expression:
+                      # Just five characters returned, so we know we are using
+patt <- "^>(.{5})"    # the right functions
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]

-# CONTINUE ...
+patt <- "^>(.*)|"    # everything to the pipe character
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+# Ooops - "|" is a metacharacter - we must escape it
+
+patt <- "^>(.*)\|"    # using "\|"
+# Ooops - that's not how we escape: must double the \ to send a literal
+# "\" plus the character "|" to the regex engine.
+
+patt <- "^>(.*)\\|"    # using "\\|"
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+# Good. Now let's first match everything that is not a "|", then match a "|"
+patt <- "^>([^|]*)\\|"
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+# the same thing again, but capture the second match. And insist that there
+# must be at least one character captured
+
+patt <- "^>[^|]*\\|([^|]+)\\|"
+# Analyze this pattern:
+#    ^           anchor the match at the beginning of the line
+#    >           ">" must be the first character
+#    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because
+#                  we don't know what other versions of the string "sp"
+#                  might appear. Note that within the brackets "|" is NOT a
+#                  metacharacter.
+#    \\|         "|" character: ouside of square brackets "|" is a metacharacter
+#                  and means "OR"; we need to escape it to match a literal "|".
+#    (           open parenthesis: capture what comes next ...
+#       [^|]+    all-characters-except-a-vertical-bar, 1 or more times
+#    )           close parenthesis: stop capturing here
+#    \\|           second "|" character, escaped
+regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
+
+
+# =    2  Counting lines  ======================================================
+
+# Task: Write a function that returns the number of atoms in a PDB file. Call it
+#       atomCount(). Sample data is here:
+myPDB <- readLines("./data/0TST.pdb")
+
+#       Specification:
+#       Read a file from its path given as the only argument.
+#       Return the number of lines in that file that begin with "ATOM  "
+#       or with "HETATM".
+
+#       Try this. Solution code is at the end of this file. Don't peek.
+
+atomCount("./data/0TST.pdb")  # must return 6



+# ==   2.1  Counting C-alpha atoms only  =======================================
+
+# Task: write a function based on the previous one that matches only CA records,
+#       i.e. it can be used to count the number of amino acids. Don't get
+#       fooled by calcium atoms, or the string CA appearing elsewhere.
+#       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
+
+#       Specification:
+#       Read a file from its path given as the only argument.
+#       Return the number of lines in that file that have a C-alpha atom.
+
+#       Try this. Solution code is at the end of this file. Don't peek.
+
+CAcount("./data/0TST.pdb")  # must return 1
+
+
+# =    3  Code Solutions  ======================================================
+
+# ==   3.1  Counting atoms  ====================================================
+
+atomCount <- function(IN) {
+  # count the number of atoms in a PDB formatted file
+  # Parameters:
+  #     IN  chr  path of the file to read
+  # Value:
+  #         numeric  number of lines that match "^ATOM  " or "^HETATM"
+  x <- readLines(IN)
+  patt <- "(^ATOM  )|(^HETATM)"
+  return(length(grep(patt, x)))
+}
+
+
+# ==   3.2  Counting C-alpha records  ==========================================
+
+
+CAcount <- function(IN) {
+  # count the number of C-alpha atoms in a PDB formatted file
+  # Parameters:
+  #     IN  chr  path of the file to read
+  # Value:
+  #         numeric  number of lines that match " CA " in position 13 - 16 of
+  #                  an ATOM record.
+  x <- readLines(IN)
+  patt <- "^ATOM  ...... CA "
+  return(length(grep(patt, x)))
+}



--- a/data/0TST.pdb
+++ b/data/0TST.pdb
@ -0,0 +1,10 @@
+HEADER   TEST                                                 0TST      0TST   1
+REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2
+ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3
+ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4
+ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5
+ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6
+TER       5      GLY     1                                              0TST   7
+HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8
+HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9
+END                                                                     0TST  10