New unit - regex practice and sample file

This commit is contained in:
hyginn 2017-10-02 14:51:11 -04:00
parent 64dfed2daf
commit 38e5822ac7
2 changed files with 149 additions and 29 deletions

View File

@ -16,51 +16,161 @@
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# ==============================================================================
# PART ONE: REGEX EXAMPLE
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------
#TOC> 1 A regex example 44
#TOC> 2 Counting lines 111
#TOC> 2.1 Counting C-alpha atoms only 128
#TOC> 3 Code Solutions 144
#TOC> 3.1 Counting atoms 146
#TOC> 3.2 Counting C-alpha records 160
#TOC>
#TOC> ==========================================================================
#TOC>
#TOC>
#TOC>
#TOC>
# The Mbp1 sequence as copied from the NCBI Website
mySeq <- "
1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk
61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha
121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr
181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq
241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss
301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy
361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts
421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp
481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt
541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp
601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk
661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr
721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak
781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha
//
"
mySeq # "\n" means: line-break
# = 1 A regex example =====================================================
mySeq <- gsub("[^a-zA-Z]", "", mySeq) # replace all non-letters with ""
# The canonical FASTA version of yeast Mbp1 at Uniprot
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
mySeq
nchar(s)
# Must be 969
# Now to change the sequence to upper-case. R has toupper()
# and tolower().
# Fetch the Uniprot ID by retrieving the first string that appears between two
# vertical bars in the header line.
#
toupper(mySeq)
# Develop the regular expression:
# Just five characters returned, so we know we are using
patt <- "^>(.{5})" # the right functions
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# CONTINUE ...
patt <- "^>(.*)|" # everything to the pipe character
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Ooops - "|" is a metacharacter - we must escape it
patt <- "^>(.*)\|" # using "\|"
# Ooops - that's not how we escape: must double the \ to send a literal
# "\" plus the character "|" to the regex engine.
patt <- "^>(.*)\\|" # using "\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Good. Now let's first match everything that is not a "|", then match a "|"
patt <- "^>([^|]*)\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# the same thing again, but capture the second match. And insist that there
# must be at least one character captured
patt <- "^>[^|]*\\|([^|]+)\\|"
# Analyze this pattern:
# ^ anchor the match at the beginning of the line
# > ">" must be the first character
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
# we don't know what other versions of the string "sp"
# might appear. Note that within the brackets "|" is NOT a
# metacharacter.
# \\| "|" character: ouside of square brackets "|" is a metacharacter
# and means "OR"; we need to escape it to match a literal "|".
# ( open parenthesis: capture what comes next ...
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
# ) close parenthesis: stop capturing here
# \\| second "|" character, escaped
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# = 2 Counting lines ======================================================
# Task: Write a function that returns the number of atoms in a PDB file. Call it
# atomCount(). Sample data is here:
myPDB <- readLines("./data/0TST.pdb")
# Specification:
# Read a file from its path given as the only argument.
# Return the number of lines in that file that begin with "ATOM "
# or with "HETATM".
# Try this. Solution code is at the end of this file. Don't peek.
atomCount("./data/0TST.pdb") # must return 6
# == 2.1 Counting C-alpha atoms only =======================================
# Task: write a function based on the previous one that matches only CA records,
# i.e. it can be used to count the number of amino acids. Don't get
# fooled by calcium atoms, or the string CA appearing elsewhere.
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
# Specification:
# Read a file from its path given as the only argument.
# Return the number of lines in that file that have a C-alpha atom.
# Try this. Solution code is at the end of this file. Don't peek.
CAcount("./data/0TST.pdb") # must return 1
# = 3 Code Solutions ======================================================
# == 3.1 Counting atoms ====================================================
atomCount <- function(IN) {
# count the number of atoms in a PDB formatted file
# Parameters:
# IN chr path of the file to read
# Value:
# numeric number of lines that match "^ATOM " or "^HETATM"
x <- readLines(IN)
patt <- "(^ATOM )|(^HETATM)"
return(length(grep(patt, x)))
}
# == 3.2 Counting C-alpha records ==========================================
CAcount <- function(IN) {
# count the number of C-alpha atoms in a PDB formatted file
# Parameters:
# IN chr path of the file to read
# Value:
# numeric number of lines that match " CA " in position 13 - 16 of
# an ATOM record.
x <- readLines(IN)
patt <- "^ATOM ...... CA "
return(length(grep(patt, x)))
}

10
data/0TST.pdb Normal file
View File

@ -0,0 +1,10 @@
HEADER TEST 0TST 0TST 1
REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2
ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3
ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4
ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5
ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6
TER 5 GLY 1 0TST 7
HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8
HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9
END 0TST 10