bch441-work-abc-units/RPR-RegEx.R
2020-09-22 21:02:13 +10:00

181 lines
6.7 KiB
R

# tocID <- "RPR-RegEx.R"
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-RegEx unit
#
# Version: 1.0
#
# Date: 2017-08 - 2020-09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 0.1 Maintenance 2020
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 A regex example 41
#TOC> 2 Counting lines 108
#TOC> 2.1 Counting C-alpha atoms only 126
#TOC> 3 Code Solutions 142
#TOC> 3.1 Counting atoms 144
#TOC> 3.2 Counting C-alpha records 160
#TOC>
#TOC> ==========================================================================
# = 1 A regex example =====================================================
# The canonical FASTA version of yeast Mbp1 at Uniprot
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
nchar(s)
# Must be 969
# Task: Fetch the Uniprot ID by retrieving the first string that appears between
# two vertical bars ("pipes") in the header record.
#
# Develop the regular expression:
# Just five characters returned, so we know we are using
patt <- "^>(.{5})" # the right functions
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
patt <- "^>(.*)|" # everything to the pipe character
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Ooops - "|" is a metacharacter - we must escape it
patt <- "^>(.*)\|" # using "\|"
# Ooops - that's not how we escape: must double the \ to send a literal
# "\" plus the character "|" to the regex engine.
patt <- "^>(.*)\\|" # using "\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# Good. Now let's first match everything that is not a "|", then match a "|"
patt <- "^>([^|]*)\\|"
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# the same thing again, but capture the second match. And insist that there
# must be at least one character captured
patt <- "^>[^|]*\\|([^|]+)\\|"
# Analyze this pattern:
# ^ anchor the match at the beginning of the line
# > ">" must be the first character
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
# we don't know what other versions of the string "sp"
# might appear. Note that within the brackets "|" is NOT a
# metacharacter.
# \\| "|" character: ouside of square brackets "|" is a metacharacter
# and means "OR"; we need to escape it to match a literal "|".
# ( open parenthesis: capture what comes next ...
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
# ) close parenthesis: stop capturing here
# \\| second "|" character, escaped
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
# = 2 Counting lines ======================================================
# Task: Write a function that returns the number of atoms in a PDB file. Call it
# atomCount(). Sample data is here:
myPDB <- readLines("./data/0TST.pdb")
# Specification:
# Read a file from its path given as the only argument.
# Return the number of lines in that file that begin with "ATOM "
# or with "HETATM".
# Try this. Write a function. Solution code is at the end of this file.
# Don't peek.
atomCount("./data/0TST.pdb") # must return 6
# == 2.1 Counting C-alpha atoms only =======================================
# Task: write a function based on the previous one that matches only CA records,
# i.e. it can be used to count the number of amino acids. Don't get
# fooled by calcium atoms, or the string CA appearing elsewhere.
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
# Specification:
# Read a file from its path given as the only argument.
# Return the number of lines in that file that have a C-alpha atom.
# Try this. Solution code is at the end of this file. Don't peek.
CAcount("./data/0TST.pdb") # must return 1
# = 3 Code Solutions ======================================================
# == 3.1 Counting atoms ====================================================
atomCount <- function(IN) {
# count the number of atoms in a PDB formatted file
# Parameters:
# IN chr path of the file to read
# Value:
# numeric number of lines that match "^ATOM " or "^HETATM"
# Note: the regex MUST be anchored to the beginning of the line, otherwise
# it might match somewhere in a comment!
x <- readLines(IN)
patt <- "(^ATOM )|(^HETATM)"
return(length(grep(patt, x)))
}
# == 3.2 Counting C-alpha records ==========================================
CAcount <- function(IN) {
# count the number of C-alpha atoms in a PDB formatted file
# Parameters:
# IN chr path of the file to read
# Value:
# numeric number of lines that match " CA " in position 13 - 16 of
# an ATOM record.
# Note: the regex MUST be aligned into the right position, otherwise it
# might match Calcium records!
x <- readLines(IN)
patt <- "^ATOM ...... CA "
return(length(grep(patt, x)))
}
# [END]