New unit - regex practice and sample file
This commit is contained in:
parent
64dfed2daf
commit
38e5822ac7
168
RPR-RegEx.R
168
RPR-RegEx.R
@ -16,51 +16,161 @@
|
||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||
#
|
||||
# DO NOT SIMPLY source() THESE FILES!
|
||||
|
||||
#
|
||||
# If there are portions you don't understand, use R's help system, Google for an
|
||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||
# going on. That's not how it works ...
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# ==============================================================================
|
||||
# PART ONE: REGEX EXAMPLE
|
||||
# ==============================================================================
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------
|
||||
#TOC> 1 A regex example 44
|
||||
#TOC> 2 Counting lines 111
|
||||
#TOC> 2.1 Counting C-alpha atoms only 128
|
||||
#TOC> 3 Code Solutions 144
|
||||
#TOC> 3.1 Counting atoms 146
|
||||
#TOC> 3.2 Counting C-alpha records 160
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC>
|
||||
#TOC>
|
||||
#TOC>
|
||||
|
||||
# The Mbp1 sequence as copied from the NCBI Website
|
||||
mySeq <- "
|
||||
1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk
|
||||
61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha
|
||||
121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr
|
||||
181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq
|
||||
241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss
|
||||
301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy
|
||||
361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts
|
||||
421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp
|
||||
481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt
|
||||
541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp
|
||||
601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk
|
||||
661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr
|
||||
721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak
|
||||
781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha
|
||||
//
|
||||
"
|
||||
|
||||
mySeq # "\n" means: line-break
|
||||
# = 1 A regex example =====================================================
|
||||
|
||||
mySeq <- gsub("[^a-zA-Z]", "", mySeq) # replace all non-letters with ""
|
||||
# The canonical FASTA version of yeast Mbp1 at Uniprot
|
||||
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
|
||||
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
|
||||
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
|
||||
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
|
||||
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
|
||||
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
|
||||
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
|
||||
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
|
||||
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
|
||||
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
|
||||
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
|
||||
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
|
||||
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
|
||||
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
|
||||
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
|
||||
|
||||
mySeq
|
||||
nchar(s)
|
||||
# Must be 969
|
||||
|
||||
# Now to change the sequence to upper-case. R has toupper()
|
||||
# and tolower().
|
||||
# Fetch the Uniprot ID by retrieving the first string that appears between two
|
||||
# vertical bars in the header line.
|
||||
#
|
||||
|
||||
toupper(mySeq)
|
||||
# Develop the regular expression:
|
||||
# Just five characters returned, so we know we are using
|
||||
patt <- "^>(.{5})" # the right functions
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# CONTINUE ...
|
||||
patt <- "^>(.*)|" # everything to the pipe character
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# Ooops - "|" is a metacharacter - we must escape it
|
||||
|
||||
patt <- "^>(.*)\|" # using "\|"
|
||||
# Ooops - that's not how we escape: must double the \ to send a literal
|
||||
# "\" plus the character "|" to the regex engine.
|
||||
|
||||
patt <- "^>(.*)\\|" # using "\\|"
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# Good. Now let's first match everything that is not a "|", then match a "|"
|
||||
patt <- "^>([^|]*)\\|"
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
# the same thing again, but capture the second match. And insist that there
|
||||
# must be at least one character captured
|
||||
|
||||
patt <- "^>[^|]*\\|([^|]+)\\|"
|
||||
# Analyze this pattern:
|
||||
# ^ anchor the match at the beginning of the line
|
||||
# > ">" must be the first character
|
||||
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
|
||||
# we don't know what other versions of the string "sp"
|
||||
# might appear. Note that within the brackets "|" is NOT a
|
||||
# metacharacter.
|
||||
# \\| "|" character: ouside of square brackets "|" is a metacharacter
|
||||
# and means "OR"; we need to escape it to match a literal "|".
|
||||
# ( open parenthesis: capture what comes next ...
|
||||
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
|
||||
# ) close parenthesis: stop capturing here
|
||||
# \\| second "|" character, escaped
|
||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||
|
||||
|
||||
# = 2 Counting lines ======================================================
|
||||
|
||||
# Task: Write a function that returns the number of atoms in a PDB file. Call it
|
||||
# atomCount(). Sample data is here:
|
||||
myPDB <- readLines("./data/0TST.pdb")
|
||||
|
||||
# Specification:
|
||||
# Read a file from its path given as the only argument.
|
||||
# Return the number of lines in that file that begin with "ATOM "
|
||||
# or with "HETATM".
|
||||
|
||||
# Try this. Solution code is at the end of this file. Don't peek.
|
||||
|
||||
atomCount("./data/0TST.pdb") # must return 6
|
||||
|
||||
|
||||
|
||||
# == 2.1 Counting C-alpha atoms only =======================================
|
||||
|
||||
# Task: write a function based on the previous one that matches only CA records,
|
||||
# i.e. it can be used to count the number of amino acids. Don't get
|
||||
# fooled by calcium atoms, or the string CA appearing elsewhere.
|
||||
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
|
||||
|
||||
# Specification:
|
||||
# Read a file from its path given as the only argument.
|
||||
# Return the number of lines in that file that have a C-alpha atom.
|
||||
|
||||
# Try this. Solution code is at the end of this file. Don't peek.
|
||||
|
||||
CAcount("./data/0TST.pdb") # must return 1
|
||||
|
||||
|
||||
# = 3 Code Solutions ======================================================
|
||||
|
||||
# == 3.1 Counting atoms ====================================================
|
||||
|
||||
atomCount <- function(IN) {
|
||||
# count the number of atoms in a PDB formatted file
|
||||
# Parameters:
|
||||
# IN chr path of the file to read
|
||||
# Value:
|
||||
# numeric number of lines that match "^ATOM " or "^HETATM"
|
||||
x <- readLines(IN)
|
||||
patt <- "(^ATOM )|(^HETATM)"
|
||||
return(length(grep(patt, x)))
|
||||
}
|
||||
|
||||
|
||||
# == 3.2 Counting C-alpha records ==========================================
|
||||
|
||||
|
||||
CAcount <- function(IN) {
|
||||
# count the number of C-alpha atoms in a PDB formatted file
|
||||
# Parameters:
|
||||
# IN chr path of the file to read
|
||||
# Value:
|
||||
# numeric number of lines that match " CA " in position 13 - 16 of
|
||||
# an ATOM record.
|
||||
x <- readLines(IN)
|
||||
patt <- "^ATOM ...... CA "
|
||||
return(length(grep(patt, x)))
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
10
data/0TST.pdb
Normal file
10
data/0TST.pdb
Normal file
@ -0,0 +1,10 @@
|
||||
HEADER TEST 0TST 0TST 1
|
||||
REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2
|
||||
ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3
|
||||
ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4
|
||||
ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5
|
||||
ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6
|
||||
TER 5 GLY 1 0TST 7
|
||||
HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8
|
||||
HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9
|
||||
END 0TST 10
|
Loading…
Reference in New Issue
Block a user