181 lines
6.7 KiB
R
181 lines
6.7 KiB
R
# tocID <- "RPR-RegEx.R"
|
|
#
|
|
# Purpose: A Bioinformatics Course:
|
|
# R code accompanying the RPR-RegEx unit
|
|
#
|
|
# Version: 1.0
|
|
#
|
|
# Date: 2017-08 - 2020-09
|
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
|
#
|
|
# V 0.1 Maintenance 2020
|
|
# V 0.1 First code
|
|
#
|
|
# TODO:
|
|
#
|
|
#
|
|
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
|
#
|
|
# DO NOT SIMPLY source() THESE FILES!
|
|
#
|
|
# If there are portions you don't understand, use R's help system, Google for an
|
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
|
# going on. That's not how it works ...
|
|
#
|
|
# ==============================================================================
|
|
|
|
|
|
#TOC> ==========================================================================
|
|
#TOC>
|
|
#TOC> Section Title Line
|
|
#TOC> ----------------------------------------------------
|
|
#TOC> 1 A regex example 41
|
|
#TOC> 2 Counting lines 108
|
|
#TOC> 2.1 Counting C-alpha atoms only 126
|
|
#TOC> 3 Code Solutions 142
|
|
#TOC> 3.1 Counting atoms 144
|
|
#TOC> 3.2 Counting C-alpha records 160
|
|
#TOC>
|
|
#TOC> ==========================================================================
|
|
|
|
|
|
# = 1 A regex example =====================================================
|
|
|
|
# The canonical FASTA version of yeast Mbp1 at Uniprot
|
|
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
|
|
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
|
|
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
|
|
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
|
|
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
|
|
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
|
|
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
|
|
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
|
|
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
|
|
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
|
|
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
|
|
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
|
|
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
|
|
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
|
|
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
|
|
|
|
nchar(s)
|
|
# Must be 969
|
|
|
|
# Task: Fetch the Uniprot ID by retrieving the first string that appears between
|
|
# two vertical bars ("pipes") in the header record.
|
|
#
|
|
|
|
# Develop the regular expression:
|
|
# Just five characters returned, so we know we are using
|
|
patt <- "^>(.{5})" # the right functions
|
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
|
|
|
patt <- "^>(.*)|" # everything to the pipe character
|
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
|
|
|
# Ooops - "|" is a metacharacter - we must escape it
|
|
|
|
patt <- "^>(.*)\|" # using "\|"
|
|
# Ooops - that's not how we escape: must double the \ to send a literal
|
|
# "\" plus the character "|" to the regex engine.
|
|
|
|
patt <- "^>(.*)\\|" # using "\\|"
|
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
|
|
|
# Good. Now let's first match everything that is not a "|", then match a "|"
|
|
patt <- "^>([^|]*)\\|"
|
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
|
|
|
# the same thing again, but capture the second match. And insist that there
|
|
# must be at least one character captured
|
|
|
|
patt <- "^>[^|]*\\|([^|]+)\\|"
|
|
# Analyze this pattern:
|
|
# ^ anchor the match at the beginning of the line
|
|
# > ">" must be the first character
|
|
# [^|]* all-characters-except-a-vertical-bar, 0 or more times because
|
|
# we don't know what other versions of the string "sp"
|
|
# might appear. Note that within the brackets "|" is NOT a
|
|
# metacharacter.
|
|
# \\| "|" character: ouside of square brackets "|" is a metacharacter
|
|
# and means "OR"; we need to escape it to match a literal "|".
|
|
# ( open parenthesis: capture what comes next ...
|
|
# [^|]+ all-characters-except-a-vertical-bar, 1 or more times
|
|
# ) close parenthesis: stop capturing here
|
|
# \\| second "|" character, escaped
|
|
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
|
|
|
|
|
|
# = 2 Counting lines ======================================================
|
|
|
|
# Task: Write a function that returns the number of atoms in a PDB file. Call it
|
|
# atomCount(). Sample data is here:
|
|
myPDB <- readLines("./data/0TST.pdb")
|
|
|
|
# Specification:
|
|
# Read a file from its path given as the only argument.
|
|
# Return the number of lines in that file that begin with "ATOM "
|
|
# or with "HETATM".
|
|
|
|
# Try this. Write a function. Solution code is at the end of this file.
|
|
# Don't peek.
|
|
|
|
atomCount("./data/0TST.pdb") # must return 6
|
|
|
|
|
|
|
|
# == 2.1 Counting C-alpha atoms only =======================================
|
|
|
|
# Task: write a function based on the previous one that matches only CA records,
|
|
# i.e. it can be used to count the number of amino acids. Don't get
|
|
# fooled by calcium atoms, or the string CA appearing elsewhere.
|
|
# cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
|
|
|
|
# Specification:
|
|
# Read a file from its path given as the only argument.
|
|
# Return the number of lines in that file that have a C-alpha atom.
|
|
|
|
# Try this. Solution code is at the end of this file. Don't peek.
|
|
|
|
CAcount("./data/0TST.pdb") # must return 1
|
|
|
|
|
|
# = 3 Code Solutions ======================================================
|
|
|
|
# == 3.1 Counting atoms ====================================================
|
|
|
|
atomCount <- function(IN) {
|
|
# count the number of atoms in a PDB formatted file
|
|
# Parameters:
|
|
# IN chr path of the file to read
|
|
# Value:
|
|
# numeric number of lines that match "^ATOM " or "^HETATM"
|
|
# Note: the regex MUST be anchored to the beginning of the line, otherwise
|
|
# it might match somewhere in a comment!
|
|
x <- readLines(IN)
|
|
patt <- "(^ATOM )|(^HETATM)"
|
|
return(length(grep(patt, x)))
|
|
}
|
|
|
|
|
|
# == 3.2 Counting C-alpha records ==========================================
|
|
|
|
|
|
CAcount <- function(IN) {
|
|
# count the number of C-alpha atoms in a PDB formatted file
|
|
# Parameters:
|
|
# IN chr path of the file to read
|
|
# Value:
|
|
# numeric number of lines that match " CA " in position 13 - 16 of
|
|
# an ATOM record.
|
|
# Note: the regex MUST be aligned into the right position, otherwise it
|
|
# might match Calcium records!
|
|
x <- readLines(IN)
|
|
patt <- "^ATOM ...... CA "
|
|
return(length(grep(patt, x)))
|
|
}
|
|
|
|
|
|
|
|
# [END]
|