New unit - regex practice and sample file
This commit is contained in:
parent
64dfed2daf
commit
38e5822ac7
168
RPR-RegEx.R
168
RPR-RegEx.R
@ -16,51 +16,161 @@
|
|||||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||||
#
|
#
|
||||||
# DO NOT SIMPLY source() THESE FILES!
|
# DO NOT SIMPLY source() THESE FILES!
|
||||||
|
#
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
# going on. That's not how it works ...
|
# going on. That's not how it works ...
|
||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# ==============================================================================
|
#TOC> ==========================================================================
|
||||||
# PART ONE: REGEX EXAMPLE
|
#TOC>
|
||||||
# ==============================================================================
|
#TOC> Section Title Line
|
||||||
|
#TOC> ----------------------------------------------
|
||||||
|
#TOC> 1 A regex example 44
|
||||||
|
#TOC> 2 Counting lines 111
|
||||||
|
#TOC> 2.1 Counting C-alpha atoms only 128
|
||||||
|
#TOC> 3 Code Solutions 144
|
||||||
|
#TOC> 3.1 Counting atoms 146
|
||||||
|
#TOC> 3.2 Counting C-alpha records 160
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC>
|
||||||
|
#TOC>
|
||||||
|
#TOC>
|
||||||
|
|
||||||
# The Mbp1 sequence as copied from the NCBI Website
|
|
||||||
mySeq <- "
|
|
||||||
1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk
|
|
||||||
61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha
|
|
||||||
121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr
|
|
||||||
181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq
|
|
||||||
241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss
|
|
||||||
301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy
|
|
||||||
361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts
|
|
||||||
421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp
|
|
||||||
481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt
|
|
||||||
541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp
|
|
||||||
601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk
|
|
||||||
661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr
|
|
||||||
721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak
|
|
||||||
781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha
|
|
||||||
//
|
|
||||||
"
|
|
||||||
|
|
||||||
mySeq # "\n" means: line-break
|
# = 1 A regex example =====================================================
|
||||||
|
|
||||||
mySeq <- gsub("[^a-zA-Z]", "", mySeq) # replace all non-letters with ""
|
# The canonical FASTA version of yeast Mbp1 at Uniprot
|
||||||
|
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
|
||||||
|
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
|
||||||
|
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
|
||||||
|
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
|
||||||
|
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
|
||||||
|
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
|
||||||
|
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
|
||||||
|
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
|
||||||
|
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
|
||||||
|
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
|
||||||
|
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
|
||||||
|
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
|
||||||
|
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
|
||||||
|
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
|
||||||
|
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
|
||||||
|
|
||||||
mySeq
|
nchar(s)
|
||||||
|
# Must be 969
|
||||||
|
|
||||||
# Now to change the sequence to upper-case. R has toupper()
|
# Fetch the Uniprot ID by retrieving the first string that appears between two
|
||||||
# and tolower().
|
# vertical bars in the header line.
|
||||||
|
#
|
||||||
|
|
||||||
toupper(mySeq)
|
# Develop the regular expression:
|
||||||
|
# Just five characters returned, so we know we are using
|
||||||
|
patt <- "^>(.{5})" # the right functions
|
||||||
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||||
|
|
||||||
# CONTINUE ...
|
patt <- "^>(.*)|" # everything to the pipe character
|
||||||
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||||
|
|
||||||
|
# Ooops - "|" is a metacharacter - we must escape it
|
||||||
|
|
||||||
|
patt <- "^>(.*)\|" # using "\|"
|
||||||
|
# Ooops - that's not how we escape: must double the \ to send a literal
|
||||||
|
# "\" plus the character "|" to the regex engine.
|
||||||
|
|
||||||
|
patt <- "^>(.*)\\|" # using "\\|"
|
||||||
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||||
|
|
||||||
|
# Good. Now let's first match everything that is not a "|", then match a "|"
|
||||||
|
patt <- "^>([^|]*)\\|"
|
||||||
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||||
|
|
||||||
|
# the same thing again, but capture the second match. And insist that there
|
||||||
|
# must be at least one character captured
|
||||||
|
|
||||||
|
patt <- "^>[^|]*\\|([^|]+)\\|"
|
||||||
|
# Analyze this pattern:
|
||||||
|
# ^ anchor the match at the beginning of the line
|
||||||
|
# > ">" must be the first character
|
||||||
|
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
|
||||||
|
# we don't know what other versions of the string "sp"
|
||||||
|
# might appear. Note that within the brackets "|" is NOT a
|
||||||
|
# metacharacter.
|
||||||
|
# \\| "|" character: ouside of square brackets "|" is a metacharacter
|
||||||
|
# and means "OR"; we need to escape it to match a literal "|".
|
||||||
|
# ( open parenthesis: capture what comes next ...
|
||||||
|
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
|
||||||
|
# ) close parenthesis: stop capturing here
|
||||||
|
# \\| second "|" character, escaped
|
||||||
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
||||||
|
|
||||||
|
|
||||||
|
# = 2 Counting lines ======================================================
|
||||||
|
|
||||||
|
# Task: Write a function that returns the number of atoms in a PDB file. Call it
|
||||||
|
# atomCount(). Sample data is here:
|
||||||
|
myPDB <- readLines("./data/0TST.pdb")
|
||||||
|
|
||||||
|
# Specification:
|
||||||
|
# Read a file from its path given as the only argument.
|
||||||
|
# Return the number of lines in that file that begin with "ATOM "
|
||||||
|
# or with "HETATM".
|
||||||
|
|
||||||
|
# Try this. Solution code is at the end of this file. Don't peek.
|
||||||
|
|
||||||
|
atomCount("./data/0TST.pdb") # must return 6
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.1 Counting C-alpha atoms only =======================================
|
||||||
|
|
||||||
|
# Task: write a function based on the previous one that matches only CA records,
|
||||||
|
# i.e. it can be used to count the number of amino acids. Don't get
|
||||||
|
# fooled by calcium atoms, or the string CA appearing elsewhere.
|
||||||
|
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
|
||||||
|
|
||||||
|
# Specification:
|
||||||
|
# Read a file from its path given as the only argument.
|
||||||
|
# Return the number of lines in that file that have a C-alpha atom.
|
||||||
|
|
||||||
|
# Try this. Solution code is at the end of this file. Don't peek.
|
||||||
|
|
||||||
|
CAcount("./data/0TST.pdb") # must return 1
|
||||||
|
|
||||||
|
|
||||||
|
# = 3 Code Solutions ======================================================
|
||||||
|
|
||||||
|
# == 3.1 Counting atoms ====================================================
|
||||||
|
|
||||||
|
atomCount <- function(IN) {
|
||||||
|
# count the number of atoms in a PDB formatted file
|
||||||
|
# Parameters:
|
||||||
|
# IN chr path of the file to read
|
||||||
|
# Value:
|
||||||
|
# numeric number of lines that match "^ATOM " or "^HETATM"
|
||||||
|
x <- readLines(IN)
|
||||||
|
patt <- "(^ATOM )|(^HETATM)"
|
||||||
|
return(length(grep(patt, x)))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 3.2 Counting C-alpha records ==========================================
|
||||||
|
|
||||||
|
|
||||||
|
CAcount <- function(IN) {
|
||||||
|
# count the number of C-alpha atoms in a PDB formatted file
|
||||||
|
# Parameters:
|
||||||
|
# IN chr path of the file to read
|
||||||
|
# Value:
|
||||||
|
# numeric number of lines that match " CA " in position 13 - 16 of
|
||||||
|
# an ATOM record.
|
||||||
|
x <- readLines(IN)
|
||||||
|
patt <- "^ATOM ...... CA "
|
||||||
|
return(length(grep(patt, x)))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
10
data/0TST.pdb
Normal file
10
data/0TST.pdb
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
HEADER TEST 0TST 0TST 1
|
||||||
|
REMARK A CATALOGUE OF ATOM AND HETATM RECORDS 0TST 2
|
||||||
|
ATOM 1 N GLY 1 -6.253 75.745 53.559 1.00 36.34 0TST 3
|
||||||
|
ATOM 2 CA GLY 1 -5.789 75.223 52.264 1.00 44.94 0TST 4
|
||||||
|
ATOM 3 C GLY 1 -5.592 73.702 52.294 1.00 32.28 0TST 5
|
||||||
|
ATOM 4 O GLY 1 -5.140 73.148 53.304 1.00 19.32 0TST 6
|
||||||
|
TER 5 GLY 1 0TST 7
|
||||||
|
HETATM 6 O HOH 1 -4.169 60.050 40.145 1.00 3.00 0TST 8
|
||||||
|
HETATM 7 CA CA 1 -1.258 -71.579 50.253 1.00 3.00 0TST 9
|
||||||
|
END 0TST 10
|
Loading…
x
Reference in New Issue
Block a user