Added and updated learning units

This commit is contained in:
hyginn 2017-09-12 16:09:20 -04:00
parent f0440b0a39
commit 1bb04b8bec
25 changed files with 5563 additions and 17 deletions

18
.init.R
View File

@ -8,6 +8,24 @@ if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
file.copy(".tmp.R", "myScript.R")
}
# If it doesn't exist yet, set up a profile:
if (! file.exists(".myProfile.R")) {
# setup profile data: prompt for eMail address and student number, then
# write both as R assignments into .myProfile.R, which is source()'d
# below on every subsequent startup.
cat("\nPlease enter the requested values correctly, no spaces, and\n")
cat("press <enter>.\n")
e <- readline("Please enter your UofT eMail address: ")
n <- readline("Please enter your Student Number: ")
conn <- file(".myProfile.R")
writeLines(c(sprintf("myEMail <- \"%s\"", e),
# NOTE(review): sprintf("%d", ...) errors if as.numeric(n) is not a
# whole number (e.g. mistyped input containing letters gives NA) -
# presumably student numbers are plain integers; confirm or validate n.
sprintf("myStudentNumber <- %d", as.numeric(n))),
conn)
close(conn)
rm(e, n, conn)  # keep the global workspace clean
}
source(".myProfile.R")    # defines myEMail and myStudentNumber
source(".utilities.R")    # course utility functions
file.edit("ABC-units.R")  # open the main course script in the editor

View File

@ -39,6 +39,63 @@ objectInfo <- function(x) {
# Done
}
# ====== Utilities
biCode <- function(s) {
  # Make a five-character uppercase code from a binomial (two-word)
  # species name by concatenating the first three letters of the genus
  # (first word) and the first two letters of the species epithet
  # (second word), e.g. "Saccharomyces cerevisiae" -> "SACCE".
  #
  # s:     character vector of binomial species names
  # Value: character vector of codes, same length as s
  #
  # The result is preallocated and the loop runs over seq_along(s),
  # so the function is vectorized over s and also behaves correctly
  # for zero-length input (1:length() would have iterated over c(1, 0)).
  b <- character(length(s))
  for (i in seq_along(s)) {
    words <- strsplit(s[i], "\\s+")[[1]]
    # Taking the prefix from the first word (not from the whole string,
    # as before) avoids embedding a blank when the genus name is
    # shorter than three characters.
    b[i] <- toupper(paste0(substr(words[1], 1, 3),
                           substr(words[2], 1, 2)))
  }
  return(b)
}
pBar <- function(i, l, nCh = 50) {
  # Draw a text progress bar in the console.
  #
  # i:   the current iteration
  # l:   the total number of iterations
  # nCh: width of the progress bar in characters
  #
  # Value: invisible(NULL); called for its side effect of printing
  #        to the console.
  if (i >= l) {  # done: terminate the bar line
    cat("\n")
    return(invisible(NULL))
  }
  ticks <- round(seq(1, l - 1, length.out = nCh))
  p <- which(i == ticks)
  if (length(p) > 0) {
    # BUGFIX: when l - 1 < nCh, ticks contains duplicated values, so
    # which() can return a vector; rep(..., times = <vector>) then
    # errors ("invalid 'times' argument"). Use the rightmost matching
    # position as the fraction completed.
    p <- max(p)
    done <- paste(rep("#", p), collapse = "")
    todo <- paste(rep("-", nCh - p), collapse = "")
    cat(sprintf("\r|%s%s|", done, todo))
    flush.console()
  }
  invisible(NULL)
}
# ====== DATA ==================================================================
# 10 species of fungi for reference analysis.
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
#
# REFspecies is referenced by other course scripts (e.g. to remove the
# reference species from the YFO candidate list in ABC-makeYFOlist.R),
# so the names must stay exact binomials with lower-case epithets.
REFspecies <- c("Aspergillus nidulans",
"Bipolaris oryzae",
"Coprinopsis cinerea",
"Cryptococcus neoformans",
"Neurospora crassa",
"Puccinia graminis",
"Saccharomyces cerevisiae",
"Schizosaccharomyces pombe",
"Ustilago maydis",
"Wallemia mellicola"
)
# [END]

View File

@ -0,0 +1,100 @@
# addSACCE_APSESproteins.R
# Adds the Saccharomyces cerevisiae APSES proteins to myDB
#
# Appends four protein records - Swi4, Phd1, Sok2 and Xbp1 - to the
# protein table of myDB. dbAutoincrement() and dbSanitizeSequence() are
# project utilities (presumably defined in .utilities.R - confirm):
# dbAutoincrement() receives the existing ID column and the "ref"
# namespace; dbSanitizeSequence() receives raw sequence text as copied
# from the NCBI record, including residue numbers and blanks.
#
# NOTE(review): assumes myDB already exists with a $protein data frame
# matching exactly these columns - confirm against the dbInit() schema.

# --- Swi4 ---
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "SWI4_SACCE",
RefSeqID = "NP_011036",
UniProtID = "P25302",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
1081 klddiekdlr ana"),
stringsAsFactors = FALSE))

# --- Phd1 ---
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "PHD1_SACCE",
RefSeqID = "NP_012881",
UniProtID = "P36093",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
361 aknels"),
stringsAsFactors = FALSE))

# --- Sok2 ---
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "SOK2_SACCE",
RefSeqID = "NP_013729",
UniProtID = "P53438",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
781 kkqek"),
stringsAsFactors = FALSE))

# --- Xbp1 ---
myDB$protein <-
rbind(myDB$protein,
data.frame(
ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
name = "XBP1_SACCE",
RefSeqID = "NP_012165",
UniProtID = "P40489",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
stringsAsFactors = FALSE))
# [END]

410
ABC-create_refDB.R Normal file
View File

@ -0,0 +1,410 @@
# create_refDB.R
# Create a reference protein database for Mbp1-like proteins
#
# Boris Steipe for BCH441
#
# For the species, see:
# cf. http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
#
# For the schema, see dbInit() in .utilities.R
#
# ==============================================================================

# Initialize the empty database (list of data frames; see dbInit()
# in .utilities.R for the table definitions).
refDB <- dbInit()

# === protein table ===
# One record per Mbp1 orthologue in the ten reference fungi.
# dbAutoincrement() generates sequential IDs in the "ref" namespace;
# dbSanitizeSequence() cleans up raw sequence text (project utilities,
# presumably defined in .utilities.R - confirm).

# --- Aspergillus nidulans ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_ASPNI",
RefSeqID = "XP_660758",
UniProtID = "Q5B8H6",
taxonomy.ID = as.integer(162425),
sequence = dbSanitizeSequence("
MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKVQGGYGKYQGT
WIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDVFSAVNHHRSMGPPSFHHEHY
DVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVAAMSLSEQEHILYGDQLLDYFMTVGDAPEAT
RIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDLLRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLL
LDTISFRDWFGATLFHHIAQTTKSKGKWKSSRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDL
LLSRCPRAGDLVNKRGETASSIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIG
AIMAEASRKLTSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE
QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKFDVHRKLVALA
TGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"),
stringsAsFactors = FALSE))

# --- Bipolaris oryzae ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_BIPOR",
RefSeqID = "XP_007682304",
UniProtID = "W6ZM86",
taxonomy.ID = as.integer(101162),
sequence = dbSanitizeSequence("
MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKVQGGYGKYQGT
WIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAAAVAAAAAAAAVANHNALMSN
SRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADGNRKRKRGMDQMSLLDQQHQIWADQLLDYFM
LLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDVGVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMP
SMVKIFQQTVHRTDWFGSTVFHHIAATTSSSNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGA
RKCVRSLLGRNVAVDIPNKKGETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVR
ESVQYRSQTASHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND
EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELVREVVGNLSVA
GMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"),
stringsAsFactors = FALSE))

# --- Neurospora crassa ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_NEUCR",
RefSeqID = "XP_955821",
UniProtID = "Q7RW59",
taxonomy.ID = as.integer(5141),
sequence = dbSanitizeSequence("
MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREVQKDTHEKIQG
GYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAVPTWGSKSAKNANPPQPGTFL
PPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFSTGHRKRKRDELIEDMTEQQHAVYGDELLDY
FLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMGDVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQT
FPQVMKELFSTIDCRDLSGCTVIHHAAVMKIGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAM
RDARKCIRALLGRGASTDIPNKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQP
NYSSDAANTVQNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE
MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDENETESEAEHP
DPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEARGMLGTGERIDKYKHLLMSC
LPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGGGQPSNGRRESVLPALRGGNGDGEMSRRGSR
TAAAAAAQVDGEREINGRAGAERTERIQEIAAV"),
stringsAsFactors = FALSE))

# --- Saccharomyces cerevisiae ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_SACCE",
RefSeqID = "NP_010227",
UniProtID = "P39678",
taxonomy.ID = as.integer(4932),
sequence = dbSanitizeSequence("
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF
GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET
KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL
PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ
QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV
NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL
SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM
MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ
MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK
KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS
LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"),
stringsAsFactors = FALSE))

# --- Schizosaccharomyces pombe ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_SCHPO", # actually the Res2 protein
RefSeqID = "NP_593032",
UniProtID = "P41412",
taxonomy.ID = as.integer(4896),
sequence = dbSanitizeSequence("
MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQGGYGKYQGTW
VPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSSSTLHSVNEKQPNSSISPTIE
SSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDKYEESLLDFFLHPEEGRIPSFLYSPPPDFQV
NSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRLSQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQS
IFHHIVQSTSTPSKVAAAKYYLDCILEKLISIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNR
QRRTASEYLLEADKKPHSLLQSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLI
RANRLKQDTLNEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS
DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLIAMSCGINPED
LSLEILDAVEEALTREK"),
stringsAsFactors = FALSE))

# --- Coprinopsis cinerea ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_COPCI",
RefSeqID = "XP_001837394",
UniProtID = "A8NYC6",
taxonomy.ID = as.integer(5346),
sequence = dbSanitizeSequence("
MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGGYGKYQGTWIP
LERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVINTRSTRKQVADGVGEESDHDT
HSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRYRQSNDRYDEDDDASRHNGMGDPRSYGDQIL
EYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACAMGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDV
RKFPELYELLHRSTLNIDNSNRTVFHHVVDVAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARC
RSKRLVKLLIDHGADPKINNHDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLH
YSAAAQKASTRCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE
NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQVQQEEVSDLV
ELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAAGCGGLEPLEIDSVLGMLLET
LEAEDPSSTSATWSGSKGQQTG"),
stringsAsFactors = FALSE))

# --- Cryptococcus neoformans ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_CRYNE",
RefSeqID = "XP_569090",
UniProtID = "Q5KMQ9",
taxonomy.ID = as.integer(5207),
sequence = dbSanitizeSequence("
MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQG
GYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKEKETGRTKATPSRTGPTSAAA
LQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVMTDQDMEVDKMGMHMSMPNVTLSQNMEELGA
GSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLGIGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFD
PNAPIDDDGHTALHWACAMGRVRVVKLLLTAGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNR
TVFHHIANLALTKGKTHAAKYYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRD
SRSAEDYILEDERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE
RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVREQRWENGELE
GNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQIAKYRKLVSAGLGGVSTNEV
DELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"),
stringsAsFactors = FALSE))

# --- Puccinia graminis ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_PUCGR",
RefSeqID = "XP_003327086",
UniProtID = "E3KED4",
taxonomy.ID = as.integer(5297),
sequence = dbSanitizeSequence("
MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETYHLLPSRSPPT
VSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREIQKGTHEKIQGGYGKYQG
TWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVKVSKVSAASAARAARAVVPSLPSTSGLGGRN
TNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSNLARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPEND
NSLTPSELSLPSRTPSPIEDLPLTVNTASSQSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPS
NAASAKYAKLILDYFVSESSQIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTAL
MRAVMFTNNHDLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE
DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPGGTSNRSDFVD
LVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERDLDHAVGLLSNIEKEYLEGQR
KILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERARRITEQRSKYLQELSIEDRKLLDSSNLRFAD
PSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRESVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSAN
TSRLNNYRKLISLGCGGIGLDEVDEVIESLNEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"),
stringsAsFactors = FALSE))

# --- Ustilago maydis ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_USTMA",
RefSeqID = "XP_011392621",
UniProtID = "A0A0D1DP35",
taxonomy.ID = as.integer(5270),
sequence = dbSanitizeSequence("
MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQGGYGKYQGTWI
PLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRSRRATSIETESEVIGAAPNNV
SEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARYADIILDYFVTENTTVPSLLINPPPDFNPDM
SIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQTALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVF
HHVVDLALSRGKPHAARYYMETMINRLADYGDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKN
AEDYIIEDERFRSSPSRTGPAGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQA
HGLLKQIQTEIEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS
TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTMAAYRRLIAAG
CGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGAP"),
stringsAsFactors = FALSE))

# --- Wallemia mellicola ---
refDB$protein <-
rbind(refDB$protein,
data.frame(
ID = dbAutoincrement(refDB$protein$ID, ns = "ref"),
name = "MBP1_WALME",
RefSeqID = "XP_006957051",
UniProtID = "I4YGC0",
taxonomy.ID = as.integer(1708541),
sequence = dbSanitizeSequence("
MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGGYGKYQGTWIP
MERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSKVFHPLSSTKHPAKLAAATNA
KAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAIDGSISYEDIILDYFISESTQIPALLIHPPSD
FNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRVNHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKH
DRTVLHHIVDLALTKSKTHAARYYMECVLSKLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPN
KDGKTAEDYILEDERFRQSPLLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKER
DYQQAQVILRNIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE
ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENANKKAALASGI
SGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"),
stringsAsFactors = FALSE))
# === taxonomy table ===
# Maps NCBI taxonomy IDs to binomial species names for the ten
# reference fungi. IDs and species names correspond row-wise and must
# match the taxonomy.ID values used in the protein table above.
refDB$taxonomy <-
  rbind(refDB$taxonomy,
        data.frame(
          ID = as.integer(c(162425,
                            101162,
                            5141,
                            4932,
                            4896,
                            5346,
                            5207,
                            5297,
                            5270,
                            1708541)),
          species = c("Aspergillus nidulans",
                      "Bipolaris oryzae",
                      "Neurospora crassa",
                      "Saccharomyces cerevisiae",
                      "Schizosaccharomyces pombe",
                      "Coprinopsis cinerea",
                      "Cryptococcus neoformans",
                      # BUGFIX: was "Puccinia Graminis" - species epithets
                      # are lower-case; the capitalized form would not
                      # match REFspecies ("Puccinia graminis") in lookups.
                      "Puccinia graminis",
                      "Ustilago maydis",
                      "Wallemia mellicola"),
          stringsAsFactors = FALSE))
# === feature table ===
# Catalogue of sequence/annotation features that protein, system and
# component annotations refer to by feature.ID.
#
# NOTE(review): every row sets type.ID = "ref_typ_1", but the type
# table created at the end of this script defines only "ref_typ_0"
# (UNDEF). Either a "ref_typ_1" record is missing from the type table
# or these should reference "ref_typ_0" - confirm against the schema.
refDB$feature <-
rbind(refDB$feature,
data.frame(
ID = c("ref_ftr_1",
"ref_ftr_2",
"ref_ftr_3",
"ref_ftr_4",
"ref_ftr_5",
"ref_ftr_6",
"ref_ftr_7",
"ref_ftr_8"),
name = c("APSES fold",
"KilA-N",
"AT hook",
"low complexity",
"Ankyrin",
"Swi6 fold",
"coiled coil",
"McInerny 2011"),
type.ID = rep("ref_typ_1", 8),
description = c("DNA binding domain by similarity to structure",
"DNA binding domain by Pfam annotation",
"DNA interaction motif by SMART annotation",
"SEG annotation by SMART",
"Ankyrin domain by SMART annotation",
"Swi6 fold by similarity to structure",
"Coiled coil by SMART annotation",
"Yeast cell cycle review"),
sourceDB = c("PDB",
"Pfam",
"SMART",
"SMART",
"SMART",
"PDB",
"SMART",
"PubMed"),
accession = c("1BM8_A_1_99",
"PF04383",
NA,
NA,
"SM00248",
"1SW6_B",
NA,
NA),
stringsAsFactors = FALSE))
# === protein annotation table ===
# There are many annotations - these are not coded explicitly but read
# from a prepared text file.
tmp <- read.table("referenceDomainAnnotations.txt",
                  header = TRUE,
                  sep = "\t",
                  comment.char = "#",
                  strip.white = TRUE,
                  stringsAsFactors = FALSE)
# Remove the notes column - it is in the text file only for our
# reference, not part of the data model. drop = FALSE keeps tmp a
# data frame even if only one column were to remain.
tmp <- tmp[ , -(ncol(tmp)), drop = FALSE]
# Add table IDs. dbAutoincrement() (project utility, presumably in
# .utilities.R - confirm) generates the next ID in the "ref" namespace
# with record code "fan". seq_len() instead of 1:nrow() makes the loop
# a no-op for an empty table (1:0 would iterate over c(1, 0)).
for (i in seq_len(nrow(tmp))) {
  tmp[i, "ID"] <- dbAutoincrement(tmp$ID, ns = "ref", code = "fan")
}
# add table to DB
refDB$proteinAnnotation <-
  rbind(refDB$proteinAnnotation,
        tmp)
# === system table ===
# One record: the G1/S transition system in S. cerevisiae.
refDB$system <-
rbind(refDB$system,
data.frame(
ID = "ref_sys_1",
name = "G1/S SACCE",
notes = paste("Regulates transition from G1 to S phase",
"in the yeast cell cycle."),
stringsAsFactors = FALSE))

# === component table ===
# Links proteins to the systems they are part of.
refDB$component <-
rbind(refDB$component,
data.frame(
ID = "ref_cmp_1",
protein.ID = "ref_pro_4", # MBP1_SACCE
system.ID = "ref_sys_1", # G1/S SACCE
status = "include",
notes = paste("Part of MBF complex."),
stringsAsFactors = FALSE))

# === system annotation table ===
refDB$systemAnnotation <-
rbind(refDB$systemAnnotation,
data.frame(
ID = "ref_san_1",
system.ID = "ref_sys_1", # G1/S SACCE
feature.ID = "ref_ftr_8", # PubMed
stringsAsFactors = FALSE))

# === component annotation table ===
refDB$componentAnnotation <-
rbind(refDB$componentAnnotation,
data.frame(
ID = "ref_can_1",
component.ID = "ref_cmp_1", # Mbp1 in G1/S SACCE
feature.ID = "ref_ftr_8", # PubMed
stringsAsFactors = FALSE))

# === type table ===
# NOTE(review): only "ref_typ_0" (UNDEF) is defined here, but the
# feature table above references "ref_typ_1" in every row - a
# "ref_typ_1" record appears to be missing; confirm.
refDB$type <-
rbind(refDB$type,
data.frame(
ID = "ref_typ_0",
name = "UNDEF",
description = "Undefined type",
stringsAsFactors = FALSE))

# === save
# Writes the assembled database to data/refDB.RData; the data/
# directory must already exist.
save(refDB, file = "data/refDB.RData")
# [END]

220
ABC-makeYFOlist.R Normal file
View File

@ -0,0 +1,220 @@
# ABC-makeYFOlist.R
#
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.1
#
# Date: 2016 09 - 2017 08
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.1 Update 2017
# V 1.0 First code 2016
#
# TODO:
# actually rerun for 2017
# type out workflow
#
# ==============================================================================
# DO NOT source() THIS FILE!
# This file is code I provide for your deeper understanding of a process and
# to provide you with useful sample code. It is not actually necessary for
# you to run this code, but I encourage you to read it carefully and discuss
# if there are parts you don't understand.
# Run the commands that interact with the NCBI servers only if you want to
# experiment with the code and/or parameters. I have commented out those
# parts. If you simply want to reproduce the process you can simply
# load() the respective intermediate results.
# ==============================================================================

# CREATING A YFO LIST
# ==============================================================================
# This script will create a list of "YFO" species and save it in an R object
# YFOspecies that is stored in the data subdirectory of this project from where
# it can be loaded.

# ==== GOLD species ============================================================
#
# Fetch and parse genome data from the NCBI genome project database

# === Initialize
if (!require(httr)) { # httr provides interfaces to Webservers on the Internet
install.packages("httr")
library(httr)
}

# The URL where the genome data can be downloaded
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"

# Read the data directly from the NCBI ftp server and put it into a dataframe.
# This will take about a minute.
# GOLDdata <- read.csv(URL,
# header = TRUE,
# sep = "\t",
# stringsAsFactors = FALSE)
# save(GOLDdata, file="data/GOLDdata.RData")
load(file="data/GOLDdata.RData")

# What columns does the table have, how is it structured?
str(GOLDdata)

# What groups of organisms are in the table? How many of each?
table(GOLDdata$Group)

# What subgroups of fungi do we have?
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])

# How many of the fungi have protein annotations?
# (a "-" in the Proteins column means: no annotation)
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")

# Get a subset of the data, with fungi that have protein annotations
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
GOLDdata$Proteins != "-" , ]

# check what we have in the table
head(GOLDfungi)

# For our purpose of defining species, we pick only the first two words from the
# "X.Organism.Name" column ... here is a function to do this:
#
makeBinomial <- function(s) {
  # Extract a binomial species name from a string.
  #
  # s:     a string expected to contain a binomial species name as its
  #        first two words, possibly followed by other text (e.g. a
  #        strain identifier)
  # Value: the first (up to) two words, separated by a single blank.
  #
  # Guarding the index with min(2, length(words)) means a one-word
  # input returns that word unchanged; the original indexed x[1:2]
  # unconditionally and pasted a literal "NA" in that case.
  words <- unlist(strsplit(s, "\\s+"))  # split on one or more whitespace
  n <- min(2, length(words))
  return(paste(words[seq_len(n)], collapse = " "))
}
# Extract the species name from every row of GOLDfungi.
# vapply() replaces the original index loop: it preallocates the
# result, guarantees a character(1) return per element, and handles an
# empty table correctly (1:nrow() would have iterated over c(1, 0)).
GOLDspecies <- vapply(GOLDfungi$X.Organism.Name,
                      makeBinomial,
                      character(1),
                      USE.NAMES = FALSE)

# Species of great interest may appear more than once, once for each
# sequenced strain: e.g. brewer's yeast:
sum(GOLDspecies == "Saccharomyces cerevisiae")

# Therefore we use the function unique() to throw out duplicates. Simple:
GOLDspecies <- unique(GOLDspecies)

length(GOLDspecies)
# i.e. we got rid of about half of the species.
# ==== BLAST species ===========================================================
#
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
# contain them.
#
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
# amount of error handling involved that is not supported by the API in a
# principled way but requires rather ad hoc solutions. The code I threw
# together to make a BLAST interface for the course is in the file BLAST.R
# Feel encouraged to study how this works.
source("BLAST.R") # load the function and its utilities

# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
# hits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
# nHits = 1000, # 633 hits in 2016
# E = 0.01, #
# limits = "txid4751[ORGN]") # = fungi
# save(hits, file="data/BLASThits.RData")
load(file="data/BLASThits.RData")

# This is a very big list that can't be usefully analyzed manually. Here
# we are only interested in the species names that it contains.

# How many hits in the list?
length(hits$hits)

# Let's look at the first one
# NOTE(review): "hits$hit" relies on R's partial matching of list names
# to reach "hits$hits" - should probably be written hits$hits[[1]].
str(hits$hit[[1]])

# the species information is in the $def element - the definition line of the
# sequence record ... but we need a function to retrieve it. This one is a great
# example, because it is really messy. Have a look:
hits$hits[[1]]$def

# We get so many species because the exact same hit has been found by BLAST in a
# number of RefSeqs sequence records, all of which are different strains of
# yeast. We only need one of those though, any one, so we shall parse out the
# first one. For this, we will simply use the immensely versatile strsplit()
# function, split on square brackets, take the second element of the resulting
# array and run that through our makeBinomial function.

# define the function (i.e. execute the lines below)
parseDeflineSpecies <- function(s) {
  # Extract the binomial species name from the first square-bracketed
  # expression of a FASTA definition line.
  #
  # s:     a string expected to contain a binomial species name in
  #        square brackets, embedded in other text
  # Value: the species name found in the first bracketed string
  #
  # "[][]" is a regex character class matching "]" or "[" - equivalent
  # to the alternation "\\]|\\[".
  fragments <- unlist(strsplit(s, "[][]"))
  # The text inside the first pair of brackets is the second fragment;
  # makeBinomial() reduces it to its first two words.
  return(makeBinomial(fragments[2]))
}
# Test the parser on a few records:
parseDeflineSpecies(hits$hits[[1]]$def)
parseDeflineSpecies(hits$hits[[11]]$def)
parseDeflineSpecies(hits$hits[[111]]$def)

# Extract all species names into a vector. The result is preallocated
# and the loop runs over seq_along() - correct even for an empty hit
# list, where 1:length() would have iterated over c(1, 0).
BLASTspecies <- character(length(hits$hits))
for (i in seq_along(hits$hits)) {
  BLASTspecies[i] <- parseDeflineSpecies(hits$hits[[i]]$def)
}

# You can confirm in the Values section of the Environment pane that
# BLASTspecies has the expected size. Again, species may appear more than once,
# e.g.
sum(BLASTspecies == "Saccharomyces cerevisiae")

# Therefore we use the function unique() to throw out duplicates. Simple:
BLASTspecies <- unique(BLASTspecies)

length(BLASTspecies)
# i.e. we got rid of about one third of the species.
#
# You should think about this: what does it mean that on average we have three
# hits by sequence similarity to Mbp1 in other species?

# ==== Intersecting BLAST and GOLD species lists ===============================

# Now we can compare the two lists for species that appear in both sources: the
# simplest way is to use the set operation functions union(), intersect(),
# etc. See here:
?union

YFOspecies <- intersect(GOLDspecies, BLASTspecies)

# Just one final thing: some of the species will be our so-called "reference"
# species, for which I will develop model solutions. I have defined them in the
# .utilities.R file (REFspecies) to make them available for future purposes;
# here we remove them from the list.
REFspecies
YFOspecies <- sort(setdiff(YFOspecies, REFspecies))

# save(YFOspecies, file = "data/YFOspecies.RData")

# [END]

View File

@ -12,29 +12,49 @@
# TODO:
#
#
# == HOW TO WORK WITH THIS FILE ================================================
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# Go through this script line by line to read and understand the
# code. Execute code by typing <cmd><enter>. When nothing is
# selected, that will execute the current line and move the cursor to
# the next line. You can also select more than one line, e.g. to
# execute a block of code, or less than one line, e.g. to execute
# only the core of a nested expression.
# Expect that the learning unit files will be continuously updated.
#
# Edit code, as required, experiment with options, or just play.
# Especially play.
# If you wish to edit any of the code, for example to add your own comments and
# examples, save any edited version under a different name. Otherwise you will
# have problems with git when you update the project to a new version.
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# DO NOT simply source() this whole file!
# While this file itself should not be edited by you, this is YOUR project
# directory, and files that you create (notes etc.) will not be harmed when you
# pull updated versions of the master, or other new files, from github.
#
# If there are portions you don't understand, use R's help system,
# Google for an answer, or ask me. Don't continue if you don't
# understand what's going on. That's not how it works ...
# If you pull from github and get the following type of error ...
# ---------------
# error: Your local changes to the following files would be
# overwritten by merge
# ...
# Please commit your changes or stash them before you can merge.
# ---------------
# ... then, you need to bring the offending file into its original state.
# Open the Commit window, select the file, and click on the Revert button.
#
# Once you have typed and executed the function init(), you will find a file
# called myScript.R in the project directory.
# Of course, you can save a local copy under a different name before you revert,
# in case you want to keep your changes.
#
#
# ==============================================================================
# Once you have typed and executed the function init(), you will find a file
# called myScript.R in the project directory.
#
# Open it, you can place all of your code-experiments and notes into that
# file. This will complement your "Course Journal". If you keep all contents in
# this one file, you can find everything by using the <cmd>-F find function. To
# cross-reference code in your journal, create section headings.
#
# Open it, you can place all of your code-experiments and notes into that
# file. This will be your "Lab Journal" for this session.
#
# ==============================================================================

256
BIN-ALI-BLAST.R Normal file
View File

@ -0,0 +1,256 @@
# BIN-ALI-BLAST.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-BLAST unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# BLAST.R
#
# Purpose: Send off one BLAST search and return parsed list of results
# This script uses the BLAST URL-API
# (Application Programming Interface) at the NCBI.
# Read about the constraints here:
# http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYP=DeveloperInfo
#
#
# Version: 1.0
# Date: 2016-09
# Author: Boris Steipe
#
#
# ToDo:
# Notes: The Bioconductor "annotate" package contains code for BLAST searches,
# in case you need to do something more involved.
#
# ==============================================================================
# Dependencies: myEMail must exist as a global variable with
# your valid email address
# waitTimer() must be loaded (it should have been loaded from
# .utilities.R, which was sourced via .Rprofile)
# library to interface with WebServers and process their XML/HTML
# responses
if (!require(xml2, quietly = TRUE)) {
install.packages("xml2")
library(xml2)
}
if (!require(httr, quietly = TRUE)) {
install.packages("httr")
library(httr)
}
parseBLAST_XML <- function(hit) {
  # Purpose: parse one BLAST <Hit> XML node (an xml2 node) and return its
  #          contents as a plain R list.
  # Parameters:
  #   hit: an xml2 node for one BLAST hit
  # Value:
  #   A list with the hit accession ($id) and definition ($def), one list
  #   per HSP in $Hsp, the HSP count in $nHsps, and summary statistics
  #   accumulated over all HSPs: best (smallest) E-value, summed alignment
  #   length, summed identities, and summed gaps.

  h <- list()
  h$id  <- xml_text(xml_find_first(hit, ".//Hit_accession"))
  h$def <- xml_text(xml_find_first(hit, ".//Hit_def"))

  # Summary statistics; updated as each HSP is parsed. bestE starts at Inf
  # so that min() picks up the first real E-value.
  h$bestE  <- Inf
  h$sumLen <- 0
  h$sumId  <- 0
  h$sumGap <- 0

  hsps <- xml_find_all(hit, ".//Hsp")
  h$Hsp <- list()
  h$nHsps <- length(hsps)

  # seq_along() is empty-safe, so the original "if (h$nHsps > 0)" guard
  # around a 1:length() loop is no longer needed.
  for (i in seq_along(hsps)) {
    hsp <- hsps[i]
    x <- list()
    x$e          <- xml_numeric(hsp, ".//Hsp_evalue")
    x$q_from     <- xml_numeric(hsp, ".//Hsp_query-from")
    x$q_to       <- xml_numeric(hsp, ".//Hsp_query-to")
    x$h_from     <- xml_numeric(hsp, ".//Hsp_hit-from")
    x$h_to       <- xml_numeric(hsp, ".//Hsp_hit-to")
    x$h_identity <- xml_numeric(hsp, ".//Hsp_identity")
    x$h_gaps     <- xml_numeric(hsp, ".//Hsp_gaps")
    x$h_len      <- xml_numeric(hsp, ".//Hsp_align-len")
    x$qseq       <- xml_text(xml_find_first(hsp, ".//Hsp_qseq"))
    x$mid        <- xml_text(xml_find_first(hsp, ".//Hsp_midline"))
    x$hseq       <- xml_text(xml_find_first(hsp, ".//Hsp_hseq"))
    h$Hsp[[i]] <- x

    # fold this HSP into the summary statistics
    h$bestE  <- min(h$bestE, x$e)
    h$sumLen <- h$sumLen + x$h_len
    h$sumId  <- h$sumId + x$h_identity
    h$sumGap <- h$sumGap + x$h_gaps
  }

  return(h)
}
xml_numeric <- function(n, p) {
  # Utility: extract the text of the first node under n that matches the
  # xpath expression p, and coerce it to numeric.
  # Parameters:
  #   n: an xml2 node
  #   p: an xpath expression (character string)
  # Value: a numeric scalar (NA if the node is absent or non-numeric)
  node <- xml_find_first(n, p)
  return(as.numeric(xml_text(node)))
}
BLAST <- function(q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 3,
                  limits = "\"\"",
                  email = myEMail,
                  rid = "",
                  quietly = FALSE) {
  # Purpose:
  #   Send one blastp search to the NCBI BLAST URL-API, wait for the
  #   estimated processing time, poll for completion, and return a parsed
  #   list of hits.
  # Parameters:
  #   q:       query - either a valid ID or a sequence
  #   db:      "refseq_protein" by default; other legal values include
  #            "nr", "pdb", "swissprot" ...
  #   nHits:   number of hits to maximally return
  #   E:       E-value cutoff. Do not return hits whose score would be
  #            expected to occur E or more times in a database of random
  #            sequence.
  #   limits:  a valid ENTREZ filter
  #   email:   a valid email address, defaults to global value myEMail
  #   rid:     a request ID from a previous submission; if non-empty no new
  #            query is sent and only the results are fetched
  #   quietly: controls printing of wait-time progress bar
  # Value:
  #   A list of resulting hits and some metadata ($rid, $rtoe, $query,
  #   $hits - each element parsed by parseBLAST_XML()).
  # Note: requires waitTimer() (from .utilities.R) and the xml2 and httr
  #       packages to be loaded.

  results <- list()
  results$rid <- rid
  results$rtoe <- 0

  if (rid == "") { # prepare, send and analyse query
    results$query <- paste(
      "https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
      "?",
      "QUERY=", q,
      "&DATABASE=", db,
      "&HITLIST_SIZE=", as.character(nHits),
      "&EXPECT=", as.character(E),
      "&PROGRAM=", "blastp",
      "&ENTREZ_QUERY=", limits,
      "&NOHEADER=", "true",
      "&EMAIL=", email,
      "&CMD=Put",
      sep = "")

    # send it off ...
    response <- read_xml(results$query, as_html = TRUE)

    # find the comment node that contains the information we need,
    # using an xpath expression
    info <- xml_find_first(response,
                           "//comment()[contains(., \"QBlastInfo\")]")
    info <- xml_text(info) # extract its contents

    # parse the request ID and the estimated "request time of execution"
    results$rid <- regmatches(info,
                              regexec("RID = (\\w+)", info))[[1]][2]
    results$rtoe <- regmatches(info,
                               regexec("RTOE = (\\d+)", info))[[1]][2]
    results$rtoe <- as.numeric(results$rtoe)
  } # done analysing query

  # Now we wait for the server's own processing-time estimate ...
  if (quietly) {
    Sys.sleep(results$rtoe)
  } else {
    cat(sprintf("BLAST is processing %s:\n", results$rid))
    waitTimer(results$rtoe)
  }

  # retrieve results from BLAST server
  URL <- paste("https://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
               "?",
               "RID=", results$rid,
               "&FORMAT_TYPE=", "XML",
               "&EMAIL=", email,
               "&CMD=Get",
               sep = "")
  raw <- GET(URL)

  # An HTML content-type means "still running": poll every 10 s until the
  # response switches to XML, or until the timeout is exhausted.
  timeOut <- 300
  nWait <- 0
  while (raw$headers["content-type"] == "text/html" &&
         nWait <= (timeOut / 10)) {
    cat("Doesn't seem to be done. Wait some more (or click STOP to abort)\n")
    waitTimer(10)
    nWait <- nWait + 1
    raw <- GET(URL)
  }

  # If we get to here, we received some result. But what?
  if (raw$headers["content-type"] == "text/html") {
    # Still HTML? Didn't complete ...
    stop(sprintf("Query >>%s<< didn't complete.", results$rid))
  } else if (raw$headers["content-type"] == "application/xml") { # Good!
    response <- read_xml(raw)
  } else { # Unknown, abort.
    stop(sprintf("Unknown response type: >>%s<<.",
                 raw$headers["content-type"]))
  }

  hits <- xml_find_all(response, ".//Hit")
  if (length(hits) == 0) {
    s <- "No hit returned.\n"
    s <- c(s, sprintf("Check your query string:\n>>%s<<\n", results$query))
    # Fixed: the original passed results$rid to a format string that had no
    # conversion specification, so the argument was silently ignored.
    s <- c(s, "and/or try again later by typing:\n")
    s <- c(s, sprintf("   BLAST(rid = \"%s\")\n", results$rid))
    stop(paste(s, collapse = ""))
  }

  results$hits <- list()
  for (i in seq_along(hits)) {
    results$hits[[i]] <- parseBLAST_XML(hits[i])
  }

  return(results)
}
# = 1 Tasks
# ==== TESTS ===================================================================
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
# sep="")
# q <- "NP_010227"
# fungi <- "txid4751[ORGN]"
#
# test <- BLAST("NP_010227",
# nHits = 1000,
# E = 0.01,
# limits = fungi)
# length(test$hits)
# [END]

170
BIN-ALI-Dotplot.R Normal file
View File

@ -0,0 +1,170 @@
# BIN-ALI-Dotplot.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Dotplot unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# First, we install and load the Biostrings package.
if (!require(Biostrings, quietly=TRUE)) {
source("https://bioconductor.org/biocLite.R")
biocLite("Biostrings")
library(Biostrings)
}
# Let's load BLOSUM62
data(BLOSUM62)
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
# matrix that has as many rows as one sequence, as many columns as another. Then
# we go through every cell of the matrix and enter the pairscore we encounter
# for the amino acid pair whose position corresponds to the row and column
# index. Finally we visualize the matrix in a plot.
#
# First we fetch our sequences and split them into single characters.
sel <- myDB$protein$name == "MBP1_SACCE"
MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
MBP1_YFO <- s2c(myDB$protein$sequence[sel])
# Check that we have two character vectors of the expected length.
str(MBP1_SACCE)
str(MBP1_YFO)
# How do we get the pairscore values? Consider: a single pair of amino acids can
# be obtained from sequence SACCE and YFO eg. from position 13 and 21 ...
MBP1_SACCE[13]
MBP1_YFO[21]
# ... using these as subsetting expressions, we can pull the pairscore
# from the MDM
BLOSUM62[MBP1_SACCE[13], MBP1_YFO[21]]
# First we build an empty matrix that will hold all pairscores ...
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_YFO)),
nrow = length(MBP1_SACCE), ncol = length(MBP1_YFO))
# ... then we loop over the sequences and store the scores in the matrix.
#
for (i in 1:length(MBP1_SACCE)) {
for (j in 1:length(MBP1_YFO)) {
dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_YFO[j]]
}
}
# Even though this is a large matrix, this does not take much time ...
# Let's have a look at a small block of the values:
dotMat[1:10, 1:10]
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
# the matrix correspond to an amino acid from MBP1_YFO.
# To plot this, we use the image() function. Here, with default parameters.
image(dotMat)
# Be patient, this takes a few moments to render: more than 500,000 values.
# Nice.
# What do you expect?
# What would similar sequences look like?
# What do you see?
# You might notice a thin line of yellow along the diagonal, moving approximately
# from bottom left to top right, fading in and out of existence. This is the
# signature of extended sequence similarity.
# Let's magnify this a bit by looking at only the first 200 amino acids ...
image(dotMat[1:200, 1:200])
# ... and, according to our normal writing convention, we would like the
# diagonal to run from top-left to bottom-right since we write from left to
# right and from top to bottom...
image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
# ... and we would like the range of the x- and y- axis to correspond to the
# sequence position ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1))
# ... and labels! Axis labels would be nice ...
image(x = 1:200, y = 1:200, dotMat[1:200, 1:200], ylim=c(200,1),
xlab = "MBP1_YFO", ylab = "MBP1_SACCE" )
# ... and why don't we have axis-numbers on all four sides? Go, make that right
# too ...
len <- 200
image(x = 1:len, y = 1:len, dotMat[1:len, 1:len], ylim=c(len,1),
xlab = "MBP1_YFO", ylab = "MBP1_SACCE", axes = FALSE)
box()
axis(1, at = c(1, seq(10, len, by=10)))
axis(2, at = c(1, seq(10, len, by=10)))
axis(3, at = c(1, seq(10, len, by=10)))
axis(4, at = c(1, seq(10, len, by=10)))
# ... you get the idea, we can infinitely customize our plot. However a good way
# to do this is to develop a particular view for, say, a report or publication
# in a script and then put it into a function. I have put a function into the
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
# there already is a dotplot function in the seqinr package:
dotPlot(MBP1_SACCE, MBP1_YFO) # seqinr
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO") # Our's
# Which one do you prefer? You can probably see the block patterns that arise
# from segments of repetitive, low complexity sequence. But you probably have to
# look very closely to discern the faint diagonals that correspond to similar
# sequence.
# Let's see if we can enhance the contrast between distributed noise and the
# actual alignment of conserved residues. We can filter the dot matrix with a
# pattern that enhances diagonally repeated values. Every value in the matrix
# will be replaced by a weighted average of its neighborhood. Here is a
# diagonal-filter:
myFilter <- matrix(numeric(25), nrow = 5)
myFilter[1, ] <- c( 1, 0, 0, 0, 0)
myFilter[2, ] <- c( 0, 1, 0, 0, 0)
myFilter[3, ] <- c( 0, 0, 1, 0, 0)
myFilter[4, ] <- c( 0, 0, 0, 1, 0)
myFilter[5, ] <- c( 0, 0, 0, 0, 1)
# I have added the option to read such filters (or others that you could define on your own) as a parameter of the function.
dotPlot2(MBP1_SACCE, MBP1_YFO, xlab = "SACCE", ylab = "YFO", f = myFilter)
# I think the result shows quite nicely how the two sequences are globally
# related and where the regions of sequence similarity are. Play with this a bit
# ... Can you come up with a better filter? If so, eMail us.
# = 1 Tasks
# [END]

192
BIN-ALI-MSA.R Normal file
View File

@ -0,0 +1,192 @@
# BIN-ALI-MSA.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-MSA unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 Multiple sequence alignment
# We will compute a multiple sequence alignment using the "muscle" algorithm
# which is available through the Bioconductor msa package.
if (!require(Biostrings, quietly=TRUE)) {
source("https://bioconductor.org/biocLite.R")
biocLite("Biostrings")
}
data(BLOSUM62)
if (!require(msa, quietly=TRUE)) {
source("https://bioconductor.org/biocLite.R")
biocLite("msa")
library(msa)
}
# If the installation asks you if you want to update older packages, always
# answer "a" for "all" unless you have an important reason not to. But if the
# installer asks you whether you want to compile from source, answer"n" for "no"
# unless you need new functionality in a particular bleeding-edge version of a
# package.
help(package = "msa")
# We have used biostrings' AAString() function before; for multiple
# alignments we need an AAStringSet(). We can simply feed it
# a vector of sequences:
# Let's make a shorthand for selection of Mbp1 proteins from our database: a
# vector of indices for all of the rows in the protein table that contain
# proteins whose name begins with MBP1.
iMbp1Proteins <- grep("^MBP1_", myDB$protein$name)
# Next we create an AAStringSet for all of those proteins
seqSet <- AAStringSet(myDB$protein$sequence[iMbp1Proteins])
# ... and align (which is very simple) ...
msaMuscle(
seqSet,
order = "aligned")
# ... but to help us make sense of the alignment we need to add the names for
# the sequences. names for a seqSet object are held in the ranges slot...
seqSet@ranges@NAMES <- myDB$protein$name[iMbp1Proteins]
seqSet
# This little step of adding names is actually really
# very important. That's because the aligned sequences
# are meaningless strings of characters unless we can
# easily identify their biological relationships.
# Creating MSAs that are only identified by e.g. their
# RefSeq ids is a type of cargo-cult bioinformatics
# that we encounter a lot. The point of the alignment
# is not to create it, but to interpret it!
# Let's align again, and assign the result ...
msa1 <- msaMuscle(
seqSet,
order = "aligned")
msa1
# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
print(msa1, show=c("alignment", "complete"), showConsensus=FALSE)
# You see that the alignment object has sequence strings with hyphens as
# indel-characters. The names are printed to the console. And you also see that
# the order has not been preserved, but the most similar sequences are now
# adjacent to each other.
# Lets write the alignment to one of the common file formats: a multi-fasta
# file.
# Why oh why does the msa package not have a function to do this !!!
# Like, seriously ...
# ==== writeMFA
# Here is our own function to write a AAStringSet object to a multi-FASTA file.
writeMFA <- function(ali, file, blockSize = 50) {
  # Purpose: write a multiple sequence alignment or a sequence set in
  #          multi-FASTA format, to the console or to a file.
  # Parameters:
  #   ali:       an MsaAAMultipleAlignment or AAStringSet object
  #   file:      output file name; if missing, output goes to the console
  #   blockSize: number of residues per output line (default 50)
  # Value: none; invoked for its side effect of printing/writing FASTA.
  # Raises: an error if ali is missing or of an unsupported class.

  if (missing(ali)) {
    stop("Input object missing from arguments with no default.")
  }
  writeToFile <- ! missing(file)
  if (writeToFile) {
    sink(file)                   # divert output to file
    on.exit(sink(), add = TRUE)  # close the diversion even if we error out
  }

  # Extract the raw data from the object, depending on its class, into a
  # named character vector of sequence strings.
  if (inherits(ali, "MsaAAMultipleAlignment")) {
    strings <- character(nrow(ali))
    for (i in seq_len(nrow(ali))) {
      strings[i] <- as.character(ali@unmasked[i])
      names(strings)[i] <- ali@unmasked@ranges@NAMES[i]
    }
  } else if (inherits(ali, "AAStringSet")) {
    strings <- character(length(ali))
    for (i in seq_along(ali)) {
      strings[i] <- as.character(ali[i])
      names(strings)[i] <- ali@ranges@NAMES[i]
    }
  } else {
    stop(paste("Input object of class",
               class(ali)[1],
               "can't be handled by this function."))
  }

  for (i in seq_along(strings)) {
    # output FASTA header
    cat(paste(">",
              names(strings)[i],
              "\n",
              sep = ""))
    # output the sequence block by block. Fixed off-by-one: the original
    # condition (from < nchar) dropped the last residue whenever the
    # sequence length was one more than a multiple of blockSize.
    from <- 1
    while (from <= nchar(strings[i])) {
      to <- from + blockSize - 1
      cat(paste(substr(strings[i], from, to), "\n", sep = ""))
      from <- to + 1
    }
    cat("\n") # output an empty line between records
  }
}
# confirm that the function works
writeMFA(seqSet)
writeMFA(msa1)
# We could use this function to write the raw and aligned sequences to file like
# so:
# writeMFA(seqSet, file = "APSES_proteins.mfa")
# writeMFA(msa1, file = "APSES_proteins_muscle.mfa")
# ... but since we don't actually need the data on file now, just copy the code
# that would create the msa to your myCode file so you can quickly reproduce it.
# == Task:
# Print the output of print(msa1) on a sheet of paper and bring it to
# class for the next quiz.
# That'll do.
# = 1 Tasks
# [END]

View File

@ -0,0 +1,290 @@
# BIN-ALI-Optimal_sequence_alignment.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 Biostrings Pairwise Alignment
# Biostrings is one of the basic packages that the Bioconductor software
# landscape builds on. It stores sequences in "AAstring" objects and these are
# complex software structures that are designed to be able to handle
# genome-scale sequences. Biostrings functions - such as the alignment functions
# - expect their input to be Biostrings objects.
?AAString
AAString("ACDE")
s <- AAString("ACDE")
str(s)
# See: it's complicated. This is an "S4" object. Bioconductor uses these objects
# almost exclusively, but we will not be poking around in their internals. Just
# this: how do we get the sequence back out of an AAString object? The help page
# for XString - the parent "class" of AAStrings - mentions the alternatives:
as.character(s) # the base R version
toString(s) # using the Biostrings function toString()
# While we need to remember to convert our sequences from the character vectors
# that we store in our database, to AAStrings that we can align, the alignment
# itself is really straightforward. The pairwiseAlignment() function was written
# to behave exactly like the functions you encountered on the EMBOSS server.
# First: make AAString objects ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
?pairwiseAlignment
# ... and align.
# Global optimal alignment with end-gap penalties is default. (like EMBOSS needle)
ali1 <- pairwiseAlignment(
aaMBP1_SACCE,
aaMBP1_YFO,
substitutionMatrix = "BLOSUM62",
gapOpening = 10,
gapExtension = 0.5)
str(ali1) # Did you think the AAString object was complicated ?
# This is a Biostrings alignment object. But we can use Biostrings functions to
# tame it:
ali1
writePairwiseAlignments(ali1) # That should look familiar
# And we can make the internal structure work for us (@ is for classes as
# $ is for lists ...)
str(ali1@pattern)
ali1@pattern
ali1@pattern@range
ali1@pattern@indel
ali1@pattern@mismatch
# or work with "normal" R functions
# the alignment length
nchar(ali1@pattern)
# the number of identities
sum(s2c(as.character(ali1@pattern)) ==
s2c(as.character(ali1@subject)))
# ... e.g. to calculate the percentage of identities
100 *
sum(s2c(as.character(ali1@pattern)) ==
s2c(as.character(ali1@subject))) /
nchar(ali1@pattern)
# ... which should be the same as reported in the writePairwiseAlignments()
# output. Awkward to type? Then it calls for a function:
#
percentID <- function(al) {
  # Return the percent sequence identity of a Biostrings pairwise
  # alignment object: identical positions divided by alignment length.
  # Parameters:
  #   al: a Biostrings PairwiseAlignments (S4) object
  # Value: a numeric scalar in [0, 100]
  patChars <- s2c(as.character(al@pattern))
  subChars <- s2c(as.character(al@subject))
  nIdentical <- sum(patChars == subChars)
  return(100 * nIdentical / nchar(al@pattern))
}
percentID(ali1)
# Compare with local optimal alignment (like EMBOSS Water)
ali2 <- pairwiseAlignment(
aaMBP1_SACCE,
aaMBP1_YFO,
type = "local",
substitutionMatrix = "BLOSUM62",
gapOpening = 50,
gapExtension = 10)
writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal
# DNA binding domain - but that one has quite
# high sequence identity:
percentID(ali2)
# == TASK: ==
# Compare the two alignments. I have weighted the local alignment heavily
# towards an ungapped alignment by setting very high gap penalties. Try changing
# the gap penalties and see what happens: how does the number of indels change,
# how does the length of indels change...
# Fine. Please return to the Wiki to study BLAST alignment...
# ==============================================================================
# PART FOUR: APSES Domain annotation by alignment
# ==============================================================================
# In this section we define the YFO APSES sequence by performing a global,
# optimal sequence alignment of the yeast domain with the full length protein
# sequence of the protein that was the most similar to the yeast APSES domain.
#
# I have annotated the yeast APSES domain as a proteinAnnotation in the
# database. To view the annotation, we can retrieve it via the proteinID and
# featureID. Here is the yeast protein ID:
myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]
# ... assign it for convenience:
proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"]
# ... and if you look at the feature table, you can identify the feature ID
myDB$feature[ , c("ID", "name", "description")]
myDB$feature$ID[myDB$feature$name == "APSES fold"]
# ... assign it for convenience:
ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"]
# ... and with the two annotations we can pull the entry from the protein
# annotation table
myDB$proteinAnnotation[myDB$proteinAnnotation$protein.ID == proID &
myDB$proteinAnnotation$feature.ID == ftrID, ]
myDB$proteinAnnotation$ID[myDB$proteinAnnotation$protein.ID == proID &
myDB$proteinAnnotation$feature.ID == ftrID]
# ... assign it for convenience:
fanID <- myDB$proteinAnnotation$ID[myDB$proteinAnnotation$protein.ID == proID &
myDB$proteinAnnotation$feature.ID == ftrID]
# The annotation record contains the start and end coordinates which we can use
# to define the APSES domain sequence with a substr() expression.
substr(myDB$protein$sequence[myDB$protein$ID == proID],
myDB$proteinAnnotation$start[myDB$proteinAnnotation$ID == fanID],
myDB$proteinAnnotation$end[myDB$proteinAnnotation$ID == fanID])
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
# have selected from the sequence column of the protein table the sequence whose
# name is "MBP1_SACCE", and selected from the proteinAnnotation table the start
# and end coordinates of the annotation that joins an "APSES fold" feature with
# the sequence, and used the start and end coordinates to extract a substring.
# The expressions get lengthy, but it's not hard to wrap all of this into a
# function so that we only need to define name and feature.
dbGetFeatureSequence
dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")
# Let's convert this to an AAstring and assign it:
aaMB1_SACCE_APSES <- AAString(dbGetFeatureSequence(myDB,
"MBP1_SACCE",
"APSES fold"))
# To align, we need the YFO sequence. Here is its definition again, just
# in case ...
sel <- myDB$protein$name == paste("MBP1_", biCode(YFO), sep = "")
aaMBP1_YFO <- AAString(myDB$protein$sequence[sel])
# Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the
# end-gap penalties off and that is crucially important since
# the sequences have very different length.
aliApses <- pairwiseAlignment(
aaMB1_SACCE_APSES,
aaMBP1_YFO,
type = "overlap",
substitutionMatrix = "BLOSUM62",
gapOpening = 10,
gapExtension = 0.5)
# Inspect the result. The aligned sequences should be clearly
# homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF should be matched
# with the "query". Is this correct?
writePairwiseAlignments(aliApses)
# If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what
# you have seen before: this is an "S4 object", not a list. No
# worries: as.character() returns a normal string.
as.character(aliApses@subject)
# Now, what are the aligned start and end coordinates? You can read them from
# the output of writePairwiseAlignments(), or you can get them from the range of
# the match.
str(aliApses@subject@range)
# start is:
aliApses@subject@range@start
# ... and end is:
aliApses@subject@range@start + aliApses@subject@range@width - 1
# Since we have this section defined now, we can create a feature annotation
# right away and store it in myDB. Copy the code-template below to your
# myCode.R file, edit it to replace the placeholder items with your data:
#
# - The <PROTEIN ID> is to be replaced with the ID of MBP1_YFO
# - The <FEATURE ID> is to be replaced with the ID of "APSES fold"
# - <START> and <END> are to be replaced with the coordinates you got above
#
# Then execute the code and continue below the code template. If you make an
# error, there are instructions on how to recover, below.
#
# ===== begin code template: add a proteinAnnotation to the database =====
# == edit all placeholder items!
myProteinID <- "<PROTEIN ID>"
myFeatureID <- "<FEATURE ID>"
myStart <- <START>
myEnd <- <END>
# == create the proteinAnnotation entry
panRow <- data.frame(ID = dbAutoincrement(myDB$proteinAnnotation$ID),
protein.ID = myProteinID,
feature.ID = myFeatureID,
start = myStart,
end = myEnd,
stringsAsFactors = FALSE)
myDB$proteinAnnotation <- rbind(myDB$proteinAnnotation, panRow)
# == check that this was successful and has the right data
myDB$proteinAnnotation[nrow(myDB$proteinAnnotation), ]
# ===== end code template ===========================================
# ... continue here.
# I expect that a correct result would look something like
# ID protein.ID feature.ID start end
# 63 my_fan_1 my_pro_1 ref_ftr_1 6 104
# If you made a mistake, simply overwrite the current version of myDB by loading
# your saved, good version: load("myDB.01.RData") and correct your mistake
# If this is correct, save it
save(myDB, file = "myDB.02.RData") # Note that it gets a new version number!
# Done with this part. Copy the sequence of the APSES domain of MBP1_<YFO> - you
# need it for the reverse BLAST search, and return to the course Wiki.
# = 1 Tasks
# [END]

69
BIN-ALI-Similarity.R Normal file
View File

@ -0,0 +1,69 @@
# BIN-ALI-Similarity.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Similarity unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 Mutation Data matrix
# First, we install and load the Biostrings package.
if (!require(Biostrings, quietly=TRUE)) {
source("https://bioconductor.org/biocLite.R")
biocLite("Biostrings")
library(Biostrings)
}
# Biostrings contains mutation matrices and other useful datasets
data(package = "Biostrings")
# Let's load BLOSUM62
data(BLOSUM62)
# ... and see what it contains. (You've seen this before, right?)
BLOSUM62
# We can simply access values via the row/column names to look at the data
# for the questions I asked in the Assignment on the Wiki:
BLOSUM62["H", "H"]
BLOSUM62["S", "S"]
BLOSUM62["L", "K"]
BLOSUM62["L", "I"]
BLOSUM62["R", "W"]
BLOSUM62["W", "R"] # the matrix is symmetric!
# = 1.1 <<<Subsection>>>
# = 1 Tasks
# [END]

View File

@ -0,0 +1,147 @@
# BIN-FUNC-Domain_annotation.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC-Domain_annotation unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 SMART Domain annotations
# Plot domain annotations as colored rectangles on a sequence.
# Step one: enter your domain annotations as features into the database.
#
# == Update myDB
# If the reference database has changed, we need to merge it in with myDB.
load("myDB.03.RData") # load the previous version of myDB
# the new version of refDB was loaded when you
# pulled it from GitHub, and then typed init()
myDB <- dbMerge(myDB) # merge the two databases and update myDB with the result
save(myDB, file = "myDB.04.RData") # save the new version
# == Update myDB
# Every annotated feature requires its own entry in the database. You have added
# the feature for the "APSES fold" before, so you can copy and edit that code
# from your myCode.R script. Here is again the table of feature IDs:
myDB$feature[ , c("ID", "name", "description")]
# Add every SMART annotated feaure for MBP1_YFO to the database. If you make
# mistakes, just reload the latest version (probably "myDB.04.RData"), then run
# your corrected annotation script again. Execute ...
myDB$proteinAnnotation
# ... to confirm.
#
# Once you are sure your annotations are correct, save the database again.
save(myDB, file = "myDB.05.RData") # save the new version
#
# Now let's plot the annotations.
#
# We need a small utility function that draws the annotation boxes on a
# representation of sequence. It will accept the left and right boundaries, the
# height and the color of the box and plot it using R's rect() function.
drawBox <- function(xLeft, xRight, y, colour) {
  # Draw a filled rectangle on the current plot: spanning xLeft to xRight
  # horizontally, centred vertically on y with a total height of 0.2,
  # filled with colour and outlined in black.
  halfHeight <- 0.1
  rect(xLeft, y - halfHeight, xRight, y + halfHeight,
       col = colour, border = "black")
}
# test this:
plot(c(-1.5, 1.5), c(0, 0), type = "l")
drawBox(-1, 1, 0.0, "peachpuff")
# Next, we define a function to plot annotations for one protein: the name of
# the protein, a horizontal grey line for its length, and all of its features.
plotProtein <- function(DB, ID, y) {
  # Purpose: plot annotations for one protein on the current plot: the
  #          protein name, a horizontal grey line for its length, and one
  #          colored box per annotated feature.
  # Parameters:
  #   DB: protein database (list of data frames), probably you want myDB
  #   ID: the ID of the protein to plot
  #   y:  vertical position at which to draw this protein's row
  # Value: none; invoked for its plotting side effects.
  # Note: requires an open plot window and the drawBox() helper.
  #
  # Define colors: we create a vector of color values, one for
  # each feature, and we give it names of the feature ID. Then we
  # can easily get the color value from the feature name.
  # A: make a vector of color values. The syntax may appear unusual -
  #    colorRampPalette() returns a function, and we simply append
  #    the parameter (number-of-features) without assigning the function
  #    to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space="Lab",
                             interpolate="linear")(nrow(DB$feature))
  # B: Features may overlap, so we make the colors transparent by setting
  #    their "alpha channel" to 1/2 (hex: 7F)
  ftrCol <- paste(ftrCol, "7F", sep = "")
  # C: we assign names, so the color can be looked up by feature ID.
  names(ftrCol) <- DB$feature$ID
  # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ]

  # find the row-index of the protein ID in the protein table of DB
  iProtein <- which(DB$protein$ID == ID)

  # write the name of the protein, right-aligned (adj=1) left of position 0
  text(-30, y, adj=1, labels=DB$protein$name[iProtein], cex=0.75 )

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd=3, col="#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$proteinAnnotation$protein.ID == ID)

  # draw a colored box for each feature, using the color keyed to its
  # feature ID
  for (i in iFtr) {
    drawBox(DB$proteinAnnotation$start[i],
            DB$proteinAnnotation$end[i],
            y,
            ftrCol[ DB$proteinAnnotation$feature.ID[i] ])
  }
}
# Plot each annotated protein:
# Get the rows of all unique annotated protein IDs in the protein table
iRows <- which(myDB$protein$ID %in% unique(myDB$proteinAnnotation$protein.ID))
# define the size of the plot-frame to accommodate all proteins
yMax <- length(iRows) * 1.1
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1 # longest sequence
# plot an empty frame
plot(1, 1, xlim = c(-200, xMax), ylim = c(0, yMax),
     type = "n", axes = FALSE, bty = "n", xlab = "sequence position", ylab = "")
axis(1, at = seq(0, xMax, by = 100))
# Finally, iterate over all proteins and call plotProtein().
# (seq_along() is safer than 1:length(): it yields an empty sequence -
#  not c(1, 0) - if iRows should ever be empty.)
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$ID[iRows[i]], i)
}
# The plot shows clearly what is variable and what is constant about the
# annotations in a group of related proteins. Print the plot and bring it to
# class for the next quiz.
#
# = 1 Tasks
# [END]

View File

@ -0,0 +1,250 @@
# BIN-PHYLO-Data_preparation.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Data_preparation unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# ==============================================================================
# PART ONE: Choosing sequences
# ==============================================================================
# Start by loading libraries. You already have the packages installed.
library(Biostrings)
library(msa)
library(stringr)
# What is the latest version of myDB that you have saved?
list.files(pattern = "myDB.*")
# ... load it (probably myDB.05.RData - if not, change the code below).
load("myDB.05.RData")
# The database contains the ten Mbp1 orthologues from the reference species
# and the Mbp1 RBM for YFO.
#
# We will construct a phylogenetic tree from the proteins' APSES domains.
# You have annotated their ranges as a feature.
# Collect APSES domain sequences from your database. The function
# dbGetFeatureSequence() retrieves the sequence that is annotated for a feature
# from its start and end coordinates. Try:
dbGetFeatureSequence(myDB, "MBP1_SACCE", "APSES fold")
# Let's put all APSES sequences into a vector:
APSESnames <- myDB$protein$name[grep("^MBP1_", myDB$protein$name)]
APSES <- character(length(APSESnames)) # preallocate the result vector
# seq_along() is safer than 1:length(): it produces an empty loop,
# not c(1, 0), if APSESnames should ever be empty.
for (i in seq_along(APSESnames)) {
  APSES[i] <- dbGetFeatureSequence(myDB, APSESnames[i], "APSES fold")
}
# Let's name the rows of our vector with the BiCode part of the protein name.
# This is important so we can keep track of which sequence is which. We use the
# gsub() function to substitute "" for "MBP1_", thereby deleting this prefix.
names(APSES) <- gsub("^MBP1_", "", APSESnames)
# inspect the result: what do you expect? Is this what you expect?
head(APSES)
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
# phylogenetic tree (see the Assignment Course Wiki page for details on the
# sequence).
APSES[length(APSES) + 1] <-
  "IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ"
names(APSES)[length(APSES)] <- "ESCCO"
# ==============================================================================
# PART TWO: Multiple sequence alignment
# ==============================================================================
# This vector of sequences with named elements fulfills the requirements to be
# imported as a Biostrings object - an AAStringSet - which we need as input for
# the MSA algorithms in Biostrings.
#
APSESSeqSet <- AAStringSet(APSES)
# align with MUSCLE; order = "aligned" returns the sequences in alignment order
APSESMsaSet <- msaMuscle(APSESSeqSet, order = "aligned")
# inspect the alignment.
# (NOTE(review): writeSeqSet() is not part of Biostrings/msa - presumably a
#  course utility loaded from .utilities.R; confirm before running stand-alone.)
writeSeqSet(APSESMsaSet, format = "ali")
# What do you think? Is this a good alignment for phylogenetic inference?
# ==============================================================================
# PART THREE: reviewing and editing alignments
# ==============================================================================
# Head back to the assignment 7 course wiki page and read up on the background
# first.
#
# Let's mask out all columns that have observations for
# less than 1/3 of the sequences in the dataset. This
# means they have more than round(nrow(msaSet) * (2/3))
# hyphens in a column.
#
# We take all sequences, split them into single
# characters, and put them into a matrix. Then we
# go through the matrix, column by column and decide
# whether we want to include that column.
# Step 1. Go through this by hand...
# get the length of the alignment
lenAli <- APSESMsaSet@unmasked@ranges@width[1]
# initialize a matrix that can hold all characters
# individually
msaMatrix <- matrix(character(nrow(APSESMsaSet) * lenAli),
                    ncol = lenAli)
# assign the correct rownames
rownames(msaMatrix) <- APSESMsaSet@unmasked@ranges@NAMES
# seq_len() is safer than 1:nrow(): it yields an empty sequence,
# not c(1, 0), if the set were ever empty.
for (i in seq_len(nrow(APSESMsaSet))) {
  # "sq", not "seq" - avoid masking base::seq() with a local variable
  sq <- as.character(APSESMsaSet@unmasked[i])
  msaMatrix[i, ] <- unlist(strsplit(sq, ""))
}
# inspect the result
msaMatrix[1:5, ]
# Now let's make a logical vector with an element
# for each column that selects which columns should
# be masked out.
# To count the number of elements in a vector, R has
# the table() function. For example ...
table(msaMatrix[ , 1])
table(msaMatrix[ , 10])
table(msaMatrix[ , 20])
table(msaMatrix[ , 30])
# Since the return value of table() is a named vector, where
# the name is the element that was counted in each slot,
# we can simply get the counts for hyphens from the
# return value of table(). We don't even need to assign
# the result to an intermediate variable, but we
# can attach the selection via square brackets,
# i.e.: ["-"], directly to the function call:
table(msaMatrix[ , 1])["-"]
# ... to get the number of hyphens. And we can compare
# whether it is eg. > 4.
table(msaMatrix[ , 1])["-"] > 4
# Thus filling our logical vector is really simple:
# initialize the mask
colMask <- logical(lenAli)
# define the threshold for rejecting a column
limit <- round(nrow(APSESMsaSet) * (2/3))
# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more.
for (i in seq_len(lenAli)) {
  count <- table(msaMatrix[ , i])["-"]
  if (is.na(count)) { # No hyphen at all in this column
    count <- 0
  }
  colMask[i] <- count <= limit
}
# inspect the mask
colMask
# How many positions were masked? R has a simple trick
# to count the number of TRUE and FALSE in a logical
# vector. If a logical TRUE or FALSE is converted into
# a number, it becomes 1 or 0 respectively. If we use
# the sum() function on the vector, the conversion is
# done implicitly. Thus ...
sum(colMask)
# ... gives the number of TRUE elements.
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
            100 * (1 - (sum(colMask) / length(colMask)))))
# Next, we use colMask to remove the masked columns from the matrix
# in one step:
maskedMatrix <- msaMatrix[ , colMask]
# check:
ncol(maskedMatrix)
# ... then collapse each row back into a sequence ...
# (preallocate to the known length instead of growing the vector)
apsMaskedSeq <- character(nrow(maskedMatrix))
for (i in seq_len(nrow(maskedMatrix))) {
  apsMaskedSeq[i] <- paste(maskedMatrix[i, ], collapse="")
}
names(apsMaskedSeq) <- rownames(maskedMatrix)
# ... and read it back into an AAStringSet object
apsMaskedSet <- AAStringSet(apsMaskedSeq)
# inspect ...
writeSeqSet(apsMaskedSet, format = "ali")
# Step 2. Turn this code into a function...
# Even though the procedure is simple, doing this more than once is tedious and
# prone to errors. I have assembled the steps we just went through into a
# function maskSet() and put it into the utilities.R file, from where it has
# been loaded when you started this session.
maskSet
# Check that the function gives identical results
# to what we did before by hand:
identical(apsMaskedSet, maskSet(APSESMsaSet))
# The result must be TRUE. If it's not TRUE you have
# an error somewhere.
# We save the aligned, masked domains to a file in multi-FASTA format.
writeSeqSet(maskSet(APSESMsaSet), file = "APSES.mfa", format = "mfa")
# = 1 Tasks
# [END]

191
BIN-PHYLO-Tree_analysis.R Normal file
View File

@ -0,0 +1,191 @@
# BIN-PHYLO-Tree_analysis.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# ==============================================================================
# PART FIVE: Tree analysis
# ==============================================================================
# An Entrez restriction command
cat(paste(paste(c(myDB$taxonomy$ID, "83333"), "[taxid]", sep=""), collapse=" OR "))
# The Common Tree from NCBI
# Download the EDITED phyliptree.phy
commonTree <- read.tree("phyliptree.phy")
# Plot the tree
plot(commonTree, cex=1.0, root.edge=TRUE, no.margin=TRUE)
nodelabels(text=commonTree$node.label, cex=0.6, adj=0.2, bg="#D4F2DA")
# === Visualizing your tree ====================================================
# The trees that are produced by Rphylip are stored as an object of class
# "phylo". This is a class for phylogenetic trees that is widely used in the
# community; practically all R phylogenetics packages will have options to read
# and manipulate such trees. Outside of R, a popular interchange format is the
# Newick_format that you have seen above. It's easy to output your calculated
# trees in Newick format and visualize them elsewhere.
# The "phylo" class object is one of R's "S3" objects and methods to plot and
# print it have been added to the system. You can simply call plot(<your-tree>)
# and R knows what to do with <your-tree> and how to plot it. The underlying
# function is plot.phylo(), and documentation for its many options can be found
# by typing:
?plot.phylo
plot(apsTree) # default type is "phylogram"
plot(apsTree, type="unrooted")
plot(apsTree, type="fan", no.margin = TRUE)
# rescale to show all of the labels:
# record the current plot parameters ...
tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE)
# ... and adjust the plot limits for a new plot
plot(apsTree,
     type="fan",
     x.lim = tmp$x.lim * 1.8,
     y.lim = tmp$y.lim * 1.8,
     cex = 0.8,
     no.margin = TRUE)
# Inspect the tree object
str(apsTree)
apsTree$tip.label
apsTree$edge
apsTree$edge.length
# show the node / edge and tip labels on a plot
plot(apsTree)
nodelabels()
edgelabels()
tiplabels()
# show the number of nodes, edges and tips
Nnode(apsTree)
Nedge(apsTree)
Ntip(apsTree)
# Finally, write the tree to console in Newick format
write.tree(apsTree)
# === Rooting Trees ============================================================
# In order to analyse the tree, it is helpful to root it first and reorder its
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
is.rooted(apsTree)
# You can root the tree with the command root() from the "ape" package. ape is
# automatically installed and loaded with Rphylip.
plot(apsTree)
# add labels for internal nodes and tips
nodelabels(cex=0.5, frame="circle")
tiplabels(cex=0.5, frame="rect")
# The outgroup of the tree is tip "8" in my sample tree, it may be a different
# number in yours. Substitute the correct node number below for "outgroup".
apsTree <- root(apsTree, outgroup = 8, resolve.root = TRUE)
plot(apsTree)
is.rooted(apsTree)
# This tree _looks_ unchanged, because when the root trifurcation was resolved,
# an edge of length zero was added to connect the MRCA (Most Recent Common
# Ancestor) of the ingroup.
# The edge lengths are stored in the phylo object:
apsTree$edge.length
# ... and you can assign a small arbitrary value to the edge
# to show how it connects to the tree without having an
# overlap.
apsTree$edge.length[1] <- 0.1
plot(apsTree, cex=0.7)
nodelabels(text="MRCA", node=12, cex=0.5, adj=0.1, bg="#ff8866")
# This procedure does however not assign an actual length to a root edge, and
# therefore no root edge is visible on the plot. Why?, you might ask. I ask
# myself that too. We'll just add a length by hand.
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
plot(apsTree, cex=0.7, root.edge=TRUE)
nodelabels(text="MRCA", node=12, cex=0.5, adj=0.8, bg="#ff8866")
# === Rotating Clades ==========================================================
# To interpret the tree, it is useful to rotate the clades so that they appear
# in the order expected from the cladogram of species.
# We can either rotate around individual internal nodes:
layout(matrix(1:2, 1, 2))
plot(apsTree, no.margin=TRUE, root.edge=TRUE)
nodelabels(node=17, cex=0.7, bg="#ff8866")
plot(rotate(apsTree, node=17), no.margin=TRUE, root.edge=TRUE)
nodelabels(node=17, cex=0.7, bg="#88ff66")
layout(matrix(1), widths=1.0, heights=1.0)
# ... or we can plot the tree so it corresponds as well as possible to a
# predefined tip ordering. Here we use the ordering that NCBI Global Tree
# returns for the reference species - we have used it above to make the vector
# apsMbp1Names. You inserted your YFO name into that vector - but you should
# move it to its correct position in the cladogram.
# (Nb. we need to reverse the ordering for the plot. This is why we use the
# expression [nOrg:1] below instead of using the vector directly.)
nOrg <- length(apsTree$tip.label)
layout(matrix(1:2, 1, 2))
plot(commonTree,
     no.margin=TRUE, root.edge=TRUE)
nodelabels(text=commonTree$node.label, cex=0.5, adj=0.2, bg="#D4F2DA")
plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
     no.margin=TRUE, root.edge=TRUE)
add.scale.bar(length=0.5)
layout(matrix(1), widths=1.0, heights=1.0)
# Study the two trees and consider their similarities and differences. What do
# you expect? What do you find?
#
# Print the two trees on one sheet of paper, write your name and student number,
# and bring it to class as your deliverable for this assignment. Also write two
# or three sentences about if/how the gene tree matches the species tree or not.
# = 1 Tasks
# [END]

113
BIN-PHYLO-Tree_building.R Normal file
View File

@ -0,0 +1,113 @@
# BIN-PHYLO-Tree_building.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_building unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# ==============================================================================
# PART FOUR: Calculating trees
# ==============================================================================
# Follow the instructions found at phylip's home on the Web to install. If you
# are on a Windows computer, take note of the installation directory.
# After you have installed Phylip on your computer, install the R package that
# provides an interface to the Phylip functions.
if (!require(Rphylip, quietly=TRUE)) {
  install.packages("Rphylip")
  library(Rphylip)
}
# This will install RPhylip, as well as its dependency, the package "ape".
# The next part may be tricky. You will need to figure out where
# on your computer Phylip has been installed and define the path
# to the proml program that calculates a maximum-likelihood tree.
# On the Mac, the standard installation places a phylip folder
# in the /Applications directory. That folder contains all the
# individual phylip programs as <name>.app files. These are not
# the actual executables, but "app" files are actually directories
# that contain the required resources for a program to run.
# The executable is in a subdirectory and you can point Rphylip
# directly to that subdirectory to find the program it needs:
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
# On Windows you need to know where the programs have been installed, and you
# need to specify a path that is correct for the Windows OS. Find the folder
# that is named "exe", and right-click to inspect its properties. The path
# should be listed among them.
# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
# assignment has to be
# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
# (Note: "/", not "\")
# I have heard that your path must not contain spaces, and it is prudent to
# avoid other special characters as well.
# If you are running Linux I trust you know what to do. It's probably
# something like
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
# Confirm that the settings are right.
PROMLPATH # returns the path
list.dirs(PROMLPATH) # returns the directories in that path
list.files(PROMLPATH) # lists the files
# If "proml" is NOT among the files that the last command returns, you
# can't continue.
# Now read the mfa file you have saved, as a "proseq" object with the
# read.protein() function of the RPhylip package:
apsIn <- read.protein("APSES.mfa")
# (NOTE(review): a leftover development line that re-read the alignment from a
#  personal "~/Desktop/APSES_HISCA.mfa" path - silently overwriting apsIn -
#  was removed here.)
# ... and you are ready to build a tree.
# Building maximum-likelihood trees can eat as much computer time
# as you can throw at it. Calculating a tree of 48 APSES domains
# with default parameters of Rproml() runs for more than half a day
# on my computer. But we have only twelve sequences here, so the
# process will take us about 5 to 10 minutes.
apsTree <- Rproml(apsIn, path=PROMLPATH)
# A quick first look:
plot(apsTree)
# = 1 Tasks
# [END]

229
BIN-PPI-Analysis.R Normal file
View File

@ -0,0 +1,229 @@
# BIN-PPI-Analysis.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PPI-Analysis unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# ==============================================================================
# PART FOUR: EXPLORE FUNCTIONAL EDGES IN THE HUMAN PROTEOME
# ==============================================================================
# In order for you to explore some real, biological networks, I give you a
# dataframe of functional relationships of human proteins that I have
# downloaded from the STRING database. The full table has 8.5 million records;
# here is a subset of records with combined confidence scores > 980.
# The selected set of edges with a confidence of > 980 is a dataframe with about
# 50,000 edges and 6,500 unique proteins. You can load the saved dataframe here
# (and also read more about what the numbers mean at
# http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
load("STRINGedges.RData")
head(STRINGedges)
# make a graph from this dataframe
# (NOTE(review): graph_from_data_frame() is an igraph function; no
#  library(igraph) call is visible in this unit - presumably it is loaded
#  elsewhere. Confirm before running this file stand-alone.)
?graph_from_data_frame
gSTR <- graph_from_data_frame(STRINGedges)
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
# layout of such large graphs is possible, but requires specialized code. Google
# for <layout large graphs> if you are curious. Also, consider what one can
# really learn from plotting such a graph ...
# Of course simple computations on this graph are reasonably fast:
compSTR <- components(gSTR)
summary(compSTR) # our graph is fully connected!
dg <- degree(gSTR)
hist(log(dg), col="#FEE0AF")
# this actually does look rather scale-free
(freqRank <- table(dg))
plot(log10(as.numeric(names(freqRank)) + 1),
     log10(as.numeric(freqRank)), type = "b",
     pch = 21, bg = "#FEE0AF",
     xlab = "log(Rank)", ylab = "log(frequency)",
     main = "6,500 nodes from the human functional interaction network")
# This looks very scale-free indeed.
#
# Now explore some more:
# === CLIQUES ========
# Let's find the largest cliques. Remember: a clique is a fully connected
# subgraph, i.e. a subgraph in which every node is connected to every other.
# Biological complexes often appear as cliques in interaction graphs.
clique_num(gSTR)
# The largest clique has 63 members.
largest_cliques(gSTR)[[1]]
# Pick one of the proteins and find out what this fully connected cluster of 63
# proteins is (you can simply Google for the ID). Is this expected?
# === BETWEENNESS CENTRALITY =======================================
# Let's find the nodes with the 10 - highest betweenness centralities.
#
BC <- centr_betw(gSTR)
# remember: BC$res contains the results
head(BC$res)
BC$res[1] # betweenness centrality of node 1 in the graph ...
# ... which one is node 1?
V(gSTR)[1]
# to get the ten-highest nodes, we simply label the elements BC with their
# index ...
# (seq_along() is the idiomatic, empty-safe form of 1:length())
names(BC$res) <- as.character(seq_along(BC$res))
# ... and then we sort:
sBC <- sort(BC$res, decreasing = TRUE)
head(sBC)
# This ordered vector means: node 3,862 has the highest betweenness centrality,
# node 1,720 has the second highest.. etc.
BCsel <- as.numeric(names(sBC)[1:10])
BCsel
# We can use the first ten labels to subset the nodes in gSTR and fetch the
# IDs...
ENSPsel <- names(V(gSTR)[BCsel])
# We are going to use these IDs to produce some output for you to print out and
# bring to class, so I need you to personalize ENSPsel with the following
# two lines of code:
set.seed(myStudentNumber)
ENSPsel <- sample(ENSPsel)
# We also need to remove the string "9606." from the ID:
ENSPsel <- gsub("9606\\.", "", ENSPsel)
# This is the final vector of IDs:
ENSPsel
# Could you define in a short answer quiz what these IDs are? And what their
# biological significance is? I'm probably not going to ask you that on the
# quiz, but I expect you to be able to.
# Next, to find what these proteins are...
# We could now Google for all of these IDs to learn more about them. But really,
# googling for IDs one after the other, that would be lame. Let's instead use
# the very, very useful biomaRt package to translate these Ensembl IDs into
# gene symbols.
# == biomaRt =========================================================
# IDs are just labels, but for _bio_informatics we need to learn more about the
# biological function of the genes or proteins that we retrieve via graph data
# mining. biomaRt is the tool of choice. It's a package distributed by the
# bioconductor project. This here is not a biomaRt tutorial (that's for another
# day), simply a few lines of sample code to get you started on the specific use
# case of retrieving descriptions for ensembl protein IDs.
# (quietly = TRUE suppresses the load message - and makes this install guard
#  consistent with the ones used in the other learning units)
if (!require(biomaRt, quietly = TRUE)) {
  source("http://bioconductor.org/biocLite.R")
  biocLite("biomaRt")
  library(biomaRt)
}
# define which dataset to use ...
myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
# what filters are defined?
filters <- listFilters(myMart)
filters
# and what attributes can we filter for?
attributes <- listAttributes(myMart)
attributes
# Soooo many options - let's look for the correct name of filters that are
# useful for ENSP IDs ...
filters[grep("ENSP", filters$description), ]
# ... and the correct attribute names for gene symbols and descriptions ...
attributes[grep("symbol", attributes$description, ignore.case=TRUE), ]
attributes[grep("description", attributes$description, ignore.case=TRUE), ]
# ... so we can put this together: here is a syntax example:
getBM(filters = "ensembl_peptide_id",
      attributes = c("hgnc_symbol",
                     "wikigene_description",
                     "interpro_description",
                     "phenotype_description"),
      values = "ENSP00000000442",
      mart = myMart)
# A simple loop will now get us the information for our 10 most central genes
# from the human subset of STRING.
CPdefs <- list() # Since we don't know how many matches one of our queries
                 # will return, we'll put the result dataframes into a list.
for (ID in ENSPsel) {
  CPdefs[[ID]] <- getBM(filters = "ensembl_peptide_id",
                        attributes = c("hgnc_symbol",
                                       "wikigene_description",
                                       "interpro_description",
                                       "phenotype_description"),
                        values = ID,
                        mart = myMart)
}
# So what are the proteins with the ten highest betweenness centralities?
# ... are you surprised? (I am! Really.)
# Final task: Write a loop that will go through your list and
# for each ID:
# -- print the ID,
# -- print the first row's symbol, and
# -- print the first row's wikigene description.
#
# (Hint, you can structure your loop in the same way as the loop that
# created CPdefs. )
# Print the R code for your loop and its output for the ten genes onto a sheet
# of paper, write your student number and name on it, and bring this to class.
# = 1 Tasks
# [END]

199
BIN-SEQA-Comparison.R Normal file
View File

@ -0,0 +1,199 @@
# BIN-SEQA-Comparison.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-SEQA-Comparison unit
#
# Version: 0.1
#
# Date: 2017 08 25
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# ==============================================================================
# PART THREE: Sequence Analysis
# ==============================================================================
if (!require(seqinr, quietly=TRUE)) {
  install.packages("seqinr")
  library(seqinr)
}
help(package = seqinr) # shows the available functions
# Let's try a simple function
?computePI
# This takes as input a vector of upper-case AA codes
# Let's retrieve the YFO sequence from our datamodel
# (assuming it is the last one that was added):
db$protein[nrow(db$protein), "sequence"]
# We can use the function strsplit() to split the string
# into single characters
s <- db$protein[nrow(db$protein), "sequence"]
s <- strsplit(s, "") # splitting on the empty string
                     # splits into single characters
s <- unlist(s) # strsplit() returns a list! Why?
               # (But we don't need a list now...)
# Alternatively, seqinr provides
# the function s2c() to convert strings into
# character vectors (and c2s to convert them back).
s <- s2c(db$protein[nrow(db$protein), "sequence"])
s
computePI(s) # isoelectric point
pmw(s) # molecular weight
AAstat(s) # This also plots the distribution of
          # values along the sequence
# A true Labor of Love has gone into the
# compilation of the "aaindex" data:
?aaindex
data(aaindex) # "attach" the dataset - i.e. make it accessible as an
              # R object
length(aaindex)
# Here are all the index descriptions
# (seq_along() is the empty-safe equivalent of 1:length())
for (i in seq_along(aaindex)) {
  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
}
# Let's use one of the indices to calculate and plot amino-acid
# composition enrichment:
aaindex[[459]]
# === Sequence Composition Enrichment
#
# Let's construct an enrichment plot to compare one of the amino acid indices
# with the situation in our sequence.
refData <- aaindex[[459]]$I # reference frequencies in %
names(refData) <- a(names(refData)) # change names to single-letter
                                    # code using seqinr's "a()" function
refData
# tabulate our sequence of interest and normalize
obsData <- table(s) # count occurrences
obsData <- 100 * (obsData / sum(obsData)) # normalize (use <-, not =, to assign)
obsData
len <- length(refData)
logRatio <- numeric() # create an empty vector
# loop over all elements of the reference, calculate log-ratios
# and store them in the vector
# (seq_len() is the empty-safe equivalent of 1:len)
for (i in seq_len(len)) {
  aa <- names(refData)[i] # get the name of that amino acid
  fObs <- obsData[aa] # retrieve the frequency for that name
  fRef <- refData[aa]
  logRatio[aa] <- log(fObs / fRef) / log(2) # remember log Ratio from
                                            # the lecture?
}
barplot(logRatio)
# Sort by frequency, descending
logRatio <- sort(logRatio, decreasing = TRUE)
barplot(logRatio) # If you can't see all of the amino acid letters in the
                  # x-axis legend, make the plot wider by dragging the
                  # vertical pane-separator to the left
# label the y-axis
# (see text() for details)
label <- expression(paste(log[2],"( f(obs) / f(ref) )", sep = ""))
barplot(logRatio,
        main = paste("AA composition enrichment"),
        ylab = label,
        cex.names=0.9)
# color the bars by type.
# define colors
chargePlus  <- "#404580"
chargeMinus <- "#ab3853"
hydrophilic <- "#9986bf"
hydrophobic <- "#d5eeb1"
plain       <- "#f2f7f7"
# Assign the colors to the different amino acid names
barColors <- character(len)
for (i in seq_along(refData)) {
  AA <- names(logRatio[i])
  if      (grepl("[HKR]",      AA)) { barColors[i] <- chargePlus  }
  else if (grepl("[DE]",       AA)) { barColors[i] <- chargeMinus }
  else if (grepl("[NQST]",     AA)) { barColors[i] <- hydrophilic }
  else if (grepl("[FAMILYVW]", AA)) { barColors[i] <- hydrophobic }
  else                              { barColors[i] <- plain       }
}
barplot(logRatio,
        main = paste("AA composition enrichment"),
        ylab = label,
        col = barColors,
        cex.names=0.9)
# draw a horizontal line at y = 0
abline(h=0)
# add a legend that indicates what the colours mean
legend(x = 1,
       y = -1,
       legend = c("charged (+)",
                  "charged (-)",
                  "hydrophilic",
                  "hydrophobic",
                  "plain"),
       bty = "n",
       fill = c(chargePlus,
                chargeMinus,
                hydrophilic,
                hydrophobic,
                plain)
)
# == TASK ==
# Interpret this plot. (Can you?)
#
#
# ==============================================================================
# [END]

683
BIN-Storing_data.R Normal file
View File

@ -0,0 +1,683 @@
# BIN-Storing_data.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Storing_data unit
#
# Version: 0.1
#
# Date: 2017 08 25
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# ==============================================================================
# PART TWO: Implementing the (protein part of) the datamodel
# ==============================================================================
# Every entity in our data model can be implemented by a dataframe. To keep
# things organized, we create a list that contains the entity tables. Here is a
# definition for a dataframe that holds the protein data for yeast Mbp1 as
# described in the data model schema:
#
# === Creating two tables ===
db <- list()  # we'll call our database "db" and start with an empty list

# Then add a dataframe with one row to the list. Each row is one protein
# record; ID is the primary key, taxonomy.ID is a foreign key into db$taxonomy.
db$protein <- data.frame(ID = as.integer(1),
                         name = "Mbp1",
                         RefSeqID = "NP_010227",
                         UniProtID = "P39678",
                         taxonomy.ID = as.integer(4932),
                         sequence = "...",  # just a placeholder
                         stringsAsFactors = FALSE)

str(db)

# Next, we create the taxonomy table and add it into db
db$taxonomy <- data.frame(ID = as.integer(4932),
                          species = "Saccharomyces cerevisiae",
                          stringsAsFactors = FALSE)

# Let's add a second protein: We create a one-row dataframe with the data,
# then we rbind() it to the existing data frame:
myRow <- data.frame(ID = as.integer(2),
                    name = "Res2",
                    RefSeqID = "NP_593032",
                    UniProtID = "P41412",
                    taxonomy.ID = as.integer(4896),
                    sequence = "...",  # again, just a placeholder
                    stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, myRow)

myRow <- data.frame(ID = as.integer(4896),
                    species = "Schizosaccharomyces pombe",
                    stringsAsFactors = FALSE)
db$taxonomy <- rbind(db$taxonomy, myRow)

str(db)

# you can look at the contents of the tables in the usual way we would access
# elements from lists and dataframes. Here are some examples:
db$protein                   # the whole protein table
db$protein$RefSeqID          # one column, as a vector
db$protein[,"name"]          # the same idea with [row, column] syntax
db$taxonomy
db$taxonomy$species
biCode(db$taxonomy$species)  # five-letter species codes (see .utilities.R)

# Here is an example to look up information in one table,
# based on a condition in another table:
# what is the species name for the protein
# whose name is "Mbp1"?

# First, get the taxonomy.ID for the Mbp1 protein. This is
# the key we need for the taxonomy table. We find it in a cell in the
# table: db$protein[<row>, <column>]
# <row> is that row for which the value in
# the "name" column is Mbp1:
db$protein$name == "Mbp1"  # a logical vector: TRUE FALSE

# The <column> is called "taxonomy.ID". Simply
# insert these two expressions in the square
# brackets.
db$protein[db$protein$name == "Mbp1", "taxonomy.ID"]

# Assign the taxonomy.ID value ...
x <- db$protein[db$protein$name == "Mbp1", "taxonomy.ID"]

# ... and fetch the species_name value from db$taxonomy
db$taxonomy[db$taxonomy$ID == x, "species"]
# === Updating the database ====================================================

# Modifying a field

# Here is code that modifies the sequence field in the protein table of the
# database. mySeq holds the raw NCBI record text (residue numbers and all);
# dbSanitizeSequence() turns that into a clean sequence string.
#
mySeq <- "
1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk
61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha
121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr
181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq
241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss
301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy
361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts
421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp
481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt
541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp
601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk
661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr
721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak
781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha
//
"

str(db$protein)  # before
db$protein$sequence[db$protein$name == "Mbp1"] <- dbSanitizeSequence(mySeq)
str(db$protein)  # after

# Analyze the expression ! Note how we specify an element of a vector (column)
# in a data frame in a list using a logical expression. And note how we assign
# the output (return value) of a function. As far as database coding goes this
# is pretty minimal - there is no error checking done at all. In particular: can
# we really guarantee that the name "Mbp1" is unique in the protein table? No!
# We never required it to be unique. This is a check we need to perform so
# frequently that we will encapsulate it in a function:
dbConfirmUnique

# try this (watch what happens for character vs. logical input, and for
# selections that match zero or more than one row):
dbConfirmUnique(c("TRUE", "FALSE"))
dbConfirmUnique(c(TRUE, FALSE))
dbConfirmUnique(c(TRUE, TRUE))
dbConfirmUnique(c(FALSE, FALSE))
#
#

# Here is the update to the sequence field of Res2 but using our
# confirmUnique() function
mySeq <- "
1 maprssavhv avysgvevye cfikgvsvmr rrrdswlnat qilkvadfdk pqrtrvlerq
61 vqigahekvq ggygkyqgtw vpfqrgvdla tkykvdgims pilsldideg kaiapkkkqt
121 kqkkpsvrgr rgrkpsslss stlhsvnekq pnssisptie ssmnkvnlpg aeeqvsatpl
181 paspnallsp ndntikpvee lgmleapldk yeeslldffl hpeegripsf lyspppdfqv
241 nsvidddght slhwacsmgh iemiklllra nadigvcnrl sqtplmrsvi ftnnydcqtf
301 gqvlellqst iyavdtngqs ifhhivqsts tpskvaaaky yldcilekli siqpfenvvr
361 lvnlqdsngd tslliaarng amdcvnslls ynanpsipnr qrrtaseyll eadkkphsll
421 qsnsnashsa fsfsgispai ispscsshaf vkaipsissk fsqlaeeyes qlrekeedli
481 ranrlkqdtl neisrtyqel tflqknnpty sqsmenlire aqetyqqlsk rlliwlearq
541 ifdlerslkp htslsisfps dflkkedgls lnndfkkpac nnvtnsdeye qlinkltslq
601 asrkkdtlyi rklyeelgid dtvnsyrrli amscginped lsleildave ealtrek
"

db$protein$sequence[dbConfirmUnique(db$protein$name == "Res2")] <- dbSanitizeSequence(mySeq)
str(db$protein)

# These expressions can get rather long, it's easier to read if we write:
select <- dbConfirmUnique(db$protein$name == "Res2")
value <- dbSanitizeSequence(mySeq)
db$protein$sequence[select] <- value

# ... and that's the code paradigm that we will adopt to update
# database fields (for now).

# Adding a record

# Adding a record is easy in principle, simply defining the values we get
# from the NCBI or EBI databases ... except for the ID field. That is a field
# we need to define internally, and once again, we'll use a small function
# to get this right.
dbAutoincrement

# After reading the code, predict the results
dbAutoincrement(1)  # Hm.
dbAutoincrement(1L)
dbAutoincrement(1:4)
dbAutoincrement(c(1, 2, 3, 4))
dbAutoincrement(c(1L, 2L, 3L, 4L))
dbAutoincrement(c(1, "4", 9))
dbAutoincrement(TRUE)

# Therefore, here is sample code to add one entry to the protein table.
#
mySeq <- "
1 msgdktifka tysgvpvyec iinnvavmrr rsddwlnatq ilkvvgldkp qrtrvlerei
61 qkgihekvqg gygkyqgtwi pldvaielae ryniqgllqp itsyvpsaad spppapkhti
121 stsnrskkii padpgalgrs rratsietes evigaapnnv segsmspsps dissssrtps
181 plpadrahpl hanhalagyn grdannhary adiildyfvt enttvpslli npppdfnpdm
241 sidddehtal hwacamgrir vvklllsaga difrvnsnqq talmratmfs nnydlrkfpe
301 lfellhrsil nidrndrtvf hhvvdlalsr gkphaaryym etminrlady gdqladilnf
361 qddegetplt maararskrl vrlllehgad pkirnkegkn aedyiieder frsspsrtgp
421 agielgadgl pvlptsslht seagqrtagr avtlmsnllh sladsydsei ntaekkltqa
481 hgllkqiqte iedsakvaea lhheaqgvde erkrvdslql alkhainkra rddlerrwse
541 gkqaikrarl qaglepgals tsnatnapat gdqkskddak sliealpagt nvktaiaelr
601 kqlsqvqank telvdkfvar areqgtgrtm aayrrliaag cggiapdevd avvgvlcell
661 qeshtgarag aggerddrar dvammlkgag aaalaanaga p
"

myRow <- data.frame(ID = dbAutoincrement(db$protein$ID),
                    name = "UMAG_1122",
                    RefSeqID = "XP_011392621",
                    UniProtID = "A0A0D1DP35",
                    taxonomy.ID = as.integer(5270),
                    sequence = dbSanitizeSequence(mySeq),
                    stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, myRow)

myRow <- data.frame(ID = as.integer(5270),
                    species = "Ustilago maydis",
                    stringsAsFactors = FALSE)
db$taxonomy <- rbind(db$taxonomy, myRow)

# Here is a bit of code template for the same ... it won't execute of course but
# you can copy it into your own code script file to modify when you need to add
# your own protein.

# ===== begin code template =====================================
mySeq <- "
RAW SEQUENCE FROM NCBI RECORD
"

myRow <- data.frame(ID = dbAutoincrement(db$protein$ID),
                    name = "NAME",
                    RefSeqID = "REFSEQ",
                    UniProtID = "UNIPROT",
                    taxonomy.ID = as.integer(TAX_ID),
                    sequence = dbSanitizeSequence(mySeq),
                    stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, myRow)

myRow <- data.frame(ID = as.integer(TAX_ID),
                    species = "BINOMIAL NAME",
                    stringsAsFactors = FALSE)
db$taxonomy <- rbind(db$taxonomy, myRow)
# ===== end code template ===========================================

# deleting a record

# This is simple code without error checking and of course we can make mistakes.
# Often we can just overwrite the offending field with correct data. But
# sometimes it will be easier (and more robust) to delete the erroneous entry
# and add the correct one. For example, if your code is in a script, and you
# realize the entry had an error, I would not "patch" the error in the script
# but delete the entry on the console command line, correct the script and
# execute the correct block. That way we fix the mistake at its source.
#
# Removing a row from a data frame is trivial: just overwrite the dataframe with
# a selection statement in which the unique selection of the offending row is
# inverted:

# create an erroneous entry
myRow <- data.frame(ID = dbAutoincrement(db$protein$ID),
                    name = "nonesuch",
                    RefSeqID = "NP_000000",
                    UniProtID = "A123456",
                    taxonomy.ID = as.integer(999999),
                    sequence = dbSanitizeSequence("utter nonsense"),
                    stringsAsFactors = FALSE)
db$protein <- rbind(db$protein, myRow)

# Make a logical vector that identifies it
select <- dbConfirmUnique(db$protein$name == "nonesuch")
select   # the selection
!select  # its logical inversion

str(db)  # before
db$protein <- db$protein[ ! select, ]  # overwrite the table with a copy
                                       # without the selected record
str(db)  # after

# Note: if you delete records "by hand" you need to be careful that you do
# not remove keys that are used as foreign keys in another table - if there
# are such dependencies, you need to update the other table(s) too. "Real"
# database systems include such dependencies in the creation instructions
# of the table schema: "on delete cascade ..."
# ==============================================================================
# ==============================================================================
# PART ONE: Database maintenance
# ==============================================================================
# === Merging databases ========================================================
# From time to time, we will want to update the systems database - either
# because the schema has changed, e.g. we may include additional tables, or
# because I have added reference data: sequences, annotations and the like. The
# goal here is to make this as simple as possible, keeping all information you
# may have entered intact, and avoiding to duplicate entries. This is not quite
# as easy as one would hope. First of all, since we need to merge records from
# different databases - your own, and the reference - we can't assume that the
# primary keys are unique. You may have introduced the same proteinID that I
# used later for the reference database. Thus we need to update keys. But if we
# update primary keys, we need to make sure that every record that references a
# key as a foreign key is updated as well. And ensuring database integrity for
# "cascading updates" takes a _lot_ of care to guarantee correctness. You don't
# want to read through that code. I don't want to write it. And we should not
# end up pretending to build a real and proper database engine here!
#
# We'll do something simple instead. The sanest way is to create
# primary keys with "namespaces", ie. the keys will reflect the source of the
# information. This way, even though the databases are merged, the information
# sources can be identified. Still, it can be accessed through a common
# mechanism. A database merge with separate namespaces can simply proceed as
# follows:
#
# for each table in the database
# merge the table from both source databases
# throw out any row with a duplicate ID
#
# In this way, if we merge your working database with the reference database
# all reference database records will be updated, and all your working records
# will remain.
# And while we are adding semantics to keys, we should also identify in which
# table they are being used. You may remember that I told you: keys should not
# try to express semantics? Here we have a slightly different situation: since
# we are not actually constructing an underlying database engine, we will often
# encounter keys that we use "manually". And in that case it is very helpful to
# make the key's domain explicit.
# The code I wrote to implement this was loaded when you typed init().

# Let's have a brief look at the new dbAutoincrement() function. You'll see the
# code as usual when you execute the function name without its parentheses:
dbAutoincrement

# The default namespace is "my" - that's the namespace you use. I will use
# "ref" for all entries of the reference database. Thus you normally won't
# need to enter the namespace identifier - only I do that.
refDB$protein$ID
dbAutoincrement(refDB$protein$ID)
dbAutoincrement(refDB$protein$ID, ns = "ref")

# Once the namespaces are thus kept well apart, we won't overwrite each other's
# work. Merging two tables becomes a breeze:
tableUnion

# Here is an example: first a small table in the "ref" namespace ...
this <- data.frame(ID = "ref_1", type = "one crow", stringsAsFactors = FALSE)
this <- rbind(this, data.frame(ID = dbAutoincrement(this$ID, ns="ref"),
                               type = "for",
                               stringsAsFactors = FALSE))
this <- rbind(this, data.frame(ID = dbAutoincrement(this$ID, ns="ref"),
                               type = "sorrow",
                               stringsAsFactors = FALSE))

# ... then a table in the (default) "my" namespace that also contains
# two of the "ref" rows
that <- data.frame(ID = "my_1", type = "two crows", stringsAsFactors = FALSE)
that <- rbind(that, this[2:3, ])  # repeat some rows
that <- rbind(that, data.frame(ID = dbAutoincrement(that$ID),
                               type = "for ",
                               stringsAsFactors = FALSE))
that <- rbind(that, data.frame(ID = dbAutoincrement(that$ID),
                               type = "joy ...",
                               stringsAsFactors = FALSE))

that
this

# merge the two tables; rows with duplicate IDs appear only once
rhyme <- tableUnion(this, that)
rhyme$type
# Finally, we need to do this for all tables in the datamodel. Well, almost all:
# we introduce a field $version, through which we can ensure that the datamodels
# have the same schema. If they had different schemas, the merging would
# break. But we don't merge this single field. (Incidentally: a schema version
# is not "data", rather we call this "metadata": information _about_ the data.)
# So we exclude the "version" from the names of elements to merge.
#
# But how to iterate over all tables in a list by name? You might be tempted to
# try something like
#
# n <- names(db)
# myDB$n[1], myDB$n[2] etc.
# ... but NO! That's not how the $ operator works.
#
# The R community keeps particularly poignant comments from the R-help mailing
# list in a package called "fortunes", and fortune(312) reads:
#
# " The problem here is that the $ notation is a magical shortcut and like any
# other magic if used incorrectly is likely to do the programmatic equivalent
# of turning yourself into a toad.
#
# -- Greg Snow (in response to a user that wanted
# to access a column whose name is stored in y
# via x$y rather than x[[y]])
# R-help (February 2012)
# So in order to avoid turning into toads, we create a vector "tables", iterate
# over "table" elements from "tables" and use them as ref[[table]] ... etc.
# === Putting data into the new schema =========================================
# Entering the YFO data into the new schema takes almost exactly
# the same syntax as the code you wrote for the last assignment.
#
# === TASK: ===
# Initialize myDB with the YFO homologue of yeast Mbp1
# First: initialize "myDB", this is the name of the R object that
# will contain data you collect.
myDB <- dbInit()  # This creates an empty database with the latest schema

# Next: Add your protein and the YFO to the database. Copy the code-template
# below to your myCode.R file, edit it to replace the placeholder
# items with your data, and execute the code. Then continue below
# the code template.

# ===== begin code template: add a protein and an organism to the database =====
# == edit placeholder items!
# (Note: as written, the template will not parse - the <...> placeholders
#  must first be replaced with actual values.)
myBinomial <- "<BINOMIAL NAME>"    # Name, NOT biCode()
myTaxonomyId <- as.integer(<TAX_ID>)
myProteinName <- "<PROTEIN NAME>"  # Name your protein "MBP1_<YFO>" (where <YFO>
                                   # is a placeholder for the biCode() of YFO)
myProteinRefSeqID <- "<REFSEQID>"
myProteinUniProtID <- "<UNIPROTID>"
mySeq <- "
<SEQUENCE>
"

# == create the protein entry
proteinRow <- data.frame(ID = dbAutoincrement(myDB$protein$ID),
                         name = myProteinName,
                         RefSeqID = myProteinRefSeqID,
                         UniProtID = myProteinUniProtID,
                         taxonomy.ID = myTaxonomyId,
                         sequence = dbSanitizeSequence(mySeq),
                         stringsAsFactors = FALSE)
myDB$protein <- rbind(myDB$protein, proteinRow)

# == create the taxonomy entry
taxonomyRow <- data.frame(ID = myTaxonomyId,
                          species = myBinomial,
                          stringsAsFactors = FALSE)
myDB$taxonomy <- rbind(myDB$taxonomy, taxonomyRow)
# ===== end code template ===========================================

# ... continue here.
# myDB now contains one record each in two tables. The remaining tables exist
# but they are empty. Check that all the values are correct: just execute
myDB

# Now let's merge myDB with the data from refDB. refDB should already have been
# loaded from .utilities.R ; you can also explore the original script with which
# refDB was created, for your reference: it is create_refDB.R  The database
# refDB is the default argument for dbMerge(), so you don't need to
# specify it. By assigning the result of dbMerge() back to myDB we overwrite the
# previous version.
myDB <- dbMerge(myDB)

str(myDB)

# check the protein table
View(myDB$protein[ , c("ID", "name", "RefSeqID")])

# Is your protein named according to the pattern "MBP1_<YFO>"? It should be.
# And does the taxonomy table contain the binomial name? It should be the same
# that you get when you type YFO into the console.

# Let's compute sequence lengths on the fly (with the function nchar() ) and
# add them to our view. Then we'll open this with the table viewer function
# View()
View(cbind(myDB$protein[ , c("ID", "name", "RefSeqID")],
           length = nchar(myDB$protein$sequence)))

# Where does your protein's length fall relative to the reference proteins?
# About the same? Much shorter? Much longer?

# Is that the right sequence?
sel <- myDB$protein$ID == "my_pro_1"
myDB$protein$sequence[sel]

# If not, don't continue! Fix the problem first.
# Let me repeat: If this does not give you the right sequence of the YFO
# Mbp1 homologue, DO NOT CONTINUE. Fix the problem.

# Is that the right taxonomy ID and binomial name for YFO?
sel <- myDB$taxonomy$species == YFO
myDB$taxonomy[sel, ]

# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
# Fix the problem first.

# Does this give you the right protein ID for MBP1_<YFO>?
sel <- dbConfirmUnique(myDB$protein$name == paste("MBP1_", biCode(YFO), sep = ""))
myDB$protein$ID[sel]

# If not, or if the result was "<0 rows> ... " then DO NOT CONTINUE.
# Fix the problem first.
#
#
# === Saving and loading data ==================================================
#
# Once you have confirmed that myDB has been successfully created and updated
# and is in a good state, you should make a backup copy. There are many ways to
# save data to a file on disk and read it back in. One of the most convenient is
# the function pair save() and load().
#
# save() saves an R object to a file. Its signature is
#    save(<object-name/s>, file = <file-name>)
# The object, or objects that are saved like this are identically recreated when
# load() is executed. The object name is not changed - you don't assign the
# result of load(). You use load() for its "side-effect": re-creating the saved
# object, not for using the return-value. Therefore the signature for load() is
# simply
#    load(<file-name>)
# All R objects in <file-name> are created by load(), if they don't yet exist.
# If they already exist, they will be overwritten. The only effect you see is
# that the object appears in the Environment pane of RStudio (if it wasn't there
# already), or you may notice that its content has changed if it did exist.

# == TASK: ==
# Save myDB so you can recreate it easily when you next open RStudio.
save(myDB, file = "myDB.01.RData")  # Note that I give this a version number!

# Let's confirm:
rm(myDB)
nrow(myDB$protein)  # must give an error
load("myDB.01.RData")
nrow(myDB$protein)  # must be 11. If not, don't continue etc.
===New Database ===
Here is some sample code to work with the new database, enter new protein data for YFO, save it and load it again when needed.
<source lang="R">
# You don't need to load the reference database refDB. If
# everything is set up correctly, it gets loaded at startup.
# (Just so you know: you can turn off that behaviour if you
# ever should want to...)

# First you need to load the newest version of dbUtilities.R
updateDButilities("7bb32ab3d0861ad81bdcb9294f0f6a737b820bf9")

# If you get an error:
#    Error: could not find function "updateDButilities"
# ... then it seems you didn't do the previous update.
#
# Try getting the update with the new key but the previous function:
# updateDbUtilities()
#
# If that function is not found either, confirm that your ~/.Rprofile
# actually loads dbUtilites.R from your project directory.
#
# As a desperate last resort, you could uncomment
# the following piece of code and run the update
# without verification...
#
# URL <- "http://steipe.biochemistry.utoronto.ca/abc/images/f/f9/DbUtilities.R"
# download.file(URL, paste(PROJECTDIR, "dbUtilities.R", sep="")), method="auto")
# source(paste(PROJECTDIR, "dbUtilities.R", sep=""))
#
# NOTE(review): the commented-out download.file() line above appears to have
# an unbalanced closing parenthesis - check it before uncommenting.
#
# But be cautious: there is no verification. You yourself need
# to satisfy yourself that this "file from the internet" is what
# it should be, before source()'ing it...

# After the file has been source()'d, refDB exists.
ls(refDB)

# check the contents of refDB:
refDB$protein$name
refDB$taxonomy

# list refSeqIDs for saccharomyces cerevisiae genes.
refDB$protein[refDB$protein$taxID == 559292, "refSeqID"]

# To add some genes from YFO, I proceed as follows.
# Obviously, you need to adapt this to your YFO
# and the sequences in YFO that you have found
# with your PSI-BLAST search.

# Let's assume my YFO is the fly agaric (amanita muscaria)
# and its APSES domain proteins have the following IDs
# (these are not refSeq btw. and thus unlikely
# to be found in UniProt) ...
# KIL68212
# KIL69256
# KIL65817
#
# First, I create a copy of the database with a name that
# I will recognize to be associated with my YFO.
amamuDB <- refDB

# Then I fetch my protein data ...
tmp1 <- fetchProteinData("KIL68212")
tmp2 <- fetchProteinData("KIL69256")
tmp3 <- fetchProteinData("KIL65817")

# ... and if I am satisfied that it contains what I
# want, I add it to the database.
amamuDB <- addToDB(amamuDB, tmp1)
amamuDB <- addToDB(amamuDB, tmp2)
amamuDB <- addToDB(amamuDB, tmp3)

# Then I make a local backup copy. Note the filename and
# version number :-)
save(amamuDB, file="amamuDB.01.RData")

# Now I can explore my new database ...
amamuDB$protein[amamuDB$protein$taxID == 946122, "refSeqID"]

# ... but if anything goes wrong, for example
# if I make a mistake in checking which
# rows contain taxID 946122 ...
amamuDB$protein$taxID = 946122

# Ooops ... what did I just do wrong?
# ... what happened instead?
amamuDB$protein$taxID

# ... I can simply recover from my backup copy:
load("amamuDB.01.RData")
amamuDB$protein$taxID

# [END]

68
BIN-YFO.R Normal file
View File

@ -0,0 +1,68 @@
# BIN-YFO.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-YFO unit
#
# Version: 0.1
#
# Date: 2017 08 25
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# ==============================================================================
# PART ONE: YFO Species
# ==============================================================================
# There are some "rabbitholes" that you are encouraged to follow to explore the
# code that went into generating the YFO species list. The minimal required
# result however is that you have picked a '''YFO''', and that its name got
# written into your personalized profile file.
# A detailed description of the process of compiling the YFO list of genome
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
# ABC_makeYFOlist.R I encourage you to study it. Here we load the
# resulting vector of species name, then pick one of them in a random but
# reproducible way, determined by your student number.
load("data/YFOspecies.RData")  # load the vector of species names
set.seed(myStudentNumber)      # seed the random number generator with your
                               # student number (from .myProfile.R) so the
                               # random pick below is reproducible
YFO <- sample(YFOspecies, 1)   # pick a species at random

# write the result to your personalized profile data so we can use the result in
# other functions
cat(sprintf("YFO <- \"%s\"\n", YFO), file = ".myProfile.R", append = TRUE)

YFO          # so, which species is it ... ?
biCode(YFO)  # and what is its "BiCode" ... ?
# Note down the species name and its five letter label on your Student Wiki user page. '''Use this species whenever this or future assignments refer to YFO'''.
#
#
# ==============================================================================
# [END]

View File

@ -0,0 +1,638 @@
# FND-MAT-Graphs_and_networks.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-MAT-Graphs_and_networks unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# This tutorial covers basic concepts of graph theory and analysis in R. You
# should have typed init() to configure some utilities in the background.
# ==============================================================================
# PART ONE: REVIEW
# ==============================================================================
# I assume you'll have read the Pavlopoulos review of graph theory concepts.
# Let's explore some of the ideas by starting with a small random graph."
# To begin let's write a little function that will create random "gene" names;
# there's no particular purpose to this other than to make our graphs look a
# little more like what we would find in a publication ...
makeRandomGenenames <- function(N) {
  # Generate N unique random "gene" names of the form <Upper><lower><lower><digit>,
  # e.g. "Abc1" - purely cosmetic, to make example graphs look biological.
  #
  # N:     number of names to generate
  # Value: a character vector of N unique names
  nam <- character()
  repeat {
    if (length(nam) >= N) {        # done once we have collected N names
      break
    }
    stem  <- paste0(sample(LETTERS, 1),                    # one uppercase ...
                    paste0(sample(letters, 2), collapse = ""))  # ... two lowercase
    digit <- sample(1:9, 1)                                # one digit 1-9
    candidate <- paste0(stem, digit)
    if (! (candidate %in% nam)) {  # keep only if not already present
      nam <- c(nam, candidate)
    }
  }
  return(nam)
}
N <- 20
set.seed(112358)
Nnames <- makeRandomGenenames(N)
Nnames
# One way to represent graphs in a computer is as an "adjacency matrix". In this
# matrix, each row and each column represents a node, and the cell at the
# intersection of a row and column contains a value/TRUE if there is an edge,
# 0/FALSE otherwise. It's easy to see that an undirected graph has a symmetric
# adjacency matrix (i, j) == (j, i); and we can put values other than {1, 0}
# into a cell if we want to represent a weighted edge.
# At first, lets create a random graph: let's say a pair of nodes has
# probability p <- 0.1 to have an edge, and our graph is symmetric and has no
# self-edges. We use our Nnames as node labels, but I've written the function so
# that we could also just ask for any number of un-named nodes, we'll use that later.
makeRandomGraph <- function(nam, p = 0.1) {
  # Build a random, undirected, unweighted graph without self-edges.
  #
  # nam: either a character vector of unique names, or a single
  #        number that will be converted into a vector of integers.
  # p:   probability that a random pair of nodes will have an edge.
  #
  # Value: a symmetric N x N adjacency matrix with entries in {0, 1},
  #        zero diagonal, and row-/column names taken from nam.
  #
  if (is.numeric(nam) && length(nam) == 1) { # if nam is a single number ...
    nam <- as.character(1:nam)               # ... make that many numbered nodes
  }
  N <- length(nam)
  G <- matrix(numeric(N * N), ncol = N)      # all-zero adjacency matrix
  rownames(G) <- nam
  colnames(G) <- nam
  if (N >= 2) {  # guard: with fewer than two nodes there are no pairs, and
                 # 1:(N-1) would run backwards (1:0) and index out of bounds
    for (iRow in 1:(N-1)) {        # iterate over the upper triangle only,
      for (iCol in (iRow+1):N) {   # so iRow != iCol: no self-edges
        if (runif(1) < p) {  # runif() creates uniform random numbers
                             # between 0 and 1
          G[iRow, iCol] <- 1 # set (row, col) ...
          G[iCol, iRow] <- 1 # ... and (col, row): the graph is undirected
        }
      }
    }
  }
  return(G)
}
set.seed(112358)
G <- makeRandomGraph(Nnames, p = 0.09)
G
# Listing the matrix is not very informative - we should plot this graph. We'll
# go into more details of the igraph package a bit later, for now we just use it
# to plot:
if (!require(igraph)) {
install.packages("igraph")
library(igraph)
}
iG <- graph_from_adjacency_matrix(G)
iGxy <- layout_with_graphopt(iG, charge=0.001) # calculate layout coordinates
# The igraph package adds its own function to the collection of plot()
# functions; R makes the selection which plot function to use based on the class
# of the object that we request to plot. This plot function has parameters
# layout - the x,y coordinates of the nodes;
# vertex.color - which I define to color by node-degree
# vertex size - which I define to increase with node-degree
# vertex.label - which I set to use our Nnames vector
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(iG,
layout = iGxy,
rescale = FALSE,
xlim = c(min(iGxy[,1]), max(iGxy[,1])) * 1.1,
ylim = c(min(iGxy[,2]), max(iGxy[,2])) * 1.1,
vertex.color=heat.colors(max(degree(iG)+1))[degree(iG)+1],
vertex.size = 800 + (150 * degree(iG)),
vertex.label = as.character(degree(iG)/2),
# vertex.label = Nnames,
edge.arrow.size = 0)
par(oPar) # reset plot window
# The simplest descriptor of a graph are the number of nodes, edges, and the
# degree-distribution. In our example, the number of nodes was given: N; the
# number of edges can easily be calculated from the adjacency matrix. In our
# matrix, we have entered 1 for every edge. Thus we simply sum over the matrix:
sum(G)
# Is that correct? Is that what you see in the plot?
# Yes and no: we entered every edge twice: once for a node [i,j], and again for
# the node [j, i]. Whether that is correct depends on what exactly we
# want to do with the matrix. If these were directed edges, we would need to
# keep track of them separately. Since we didn't intend them to be directed,
# we could divide the number of edges by 2. Why didn't we simply use an
# upper-triangular matrix? Because then we need to keep track of the ordering of
# edges if we want to know whether a particular edge exists or not. For example
# we could sort the nodes alphabetically, and make sure we always query a pair
# in alphabetical order. Then a triangular matrix would be efficient.
# What about the degree distribution? We can get that simply by summing over the
# rows (or the columns):"
rowSums(G) # check this against the plot!
# Let's plot the degree distribution in a histogram:
rs <- rowSums(G)
brk <- seq(min(rs)-0.5, max(rs)+0.5, by=1) # define breaks for the histogram
hist(rs, breaks=brk, col="#A5CCF5",
xlim = c(-1,8), xaxt = "n",
main = "Node degrees", xlab = "Degree", ylab = "Number") # plot histogram
axis(side = 1, at = 0:7)
# Note: I don't _have_ to define breaks, the hist() function usually does so
# quite well, automatically. But for this purpose I want the columns of the
# histogram to represent exactly one node-degree difference.
# A degree distribution is actually quite an important descriptor of graphs,
# since it is very sensitive to the generating mechanism. For biological
# networks, that is one of the key questions we are interested in: how was the
# network formed?
# ==============================================================================
# PART TWO: DEGREE DISTRIBUTIONS
# ==============================================================================
# Let's simulate a few graphs that are a bit bigger to get a better sense of
# their degree distributions:
#
# === random graph
set.seed(31415927)
G200 <- makeRandomGraph(200, p = 0.015)
iG200 <- graph_from_adjacency_matrix(G200)
iGxy <- layout_with_graphopt(iG200, charge=0.0001) # calculate layout coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(iG200,
layout = iGxy,
rescale = FALSE,
xlim = c(min(iGxy[,1]), max(iGxy[,1])) * 1.1,
ylim = c(min(iGxy[,2]), max(iGxy[,2])) * 1.1,
vertex.color=heat.colors(max(degree(iG200)+1))[degree(iG200)+1],
vertex.size = 200 + (30 * degree(iG200)),
vertex.label = "",
edge.arrow.size = 0)
par(oPar)
# This graph has thirteen singletons and one large, connected component. Many
# biological graphs look approximately like this.
# Calculate degree distributions
dg <- degree(iG200)/2 # here, we use the iGraph function degree()
# not rowSums() from base R.
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
hist(dg, breaks=brk, col="#A5CCF5",
xlim = c(-1,11), xaxt = "n",
main = "Node degrees", xlab = "Degree", ylab = "Number") # plot histogram
axis(side = 1, at = 0:10)
# Note the characteristic peak of this distribution: this is not "scale-free". Here is a log-log plot of frequency vs. degree-rank:
(freqRank <- table(dg))
plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b",
pch = 21, bg = "#A5CCF5",
xlab = "log(Rank)", ylab = "log(frequency)",
main = "200 nodes in a random network")
# === scale-free graph (Barabasi-Albert)
# What does one of those intriguing "scale-free" distributions look like? The
# iGraph package has a function to make random graphs according to the
# Barabasi-Albert model of scale-free graphs. It is: sample_pa(), where pa
# stands for "preferential attachment", one type of process that will yield
# scale-free distributions.
set.seed(31415927)
GBA <- sample_pa(200, power = 0.8)
iGxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(GBA,
layout = iGxy,
rescale = FALSE,
xlim = c(min(iGxy[,1]), max(iGxy[,1])) * 1.1,
ylim = c(min(iGxy[,2]), max(iGxy[,2])) * 1.1,
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
vertex.size = 200 + (30 * degree(GBA)),
vertex.label = "",
edge.arrow.size = 0)
par(oPar)
# This is a very obviously different graph! Some biological networks have
# features that look like that - but in my experience the hub nodes are usually
# not that distinct. But then again, that really depends on the parameter
# "power". Feel encouraged to change "power" and get a sense for what difference
# this makes. Also: note that the graph has only a single component.
# What's the degree distribution of this graph?
(dg <- degree(GBA))
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
hist(dg, breaks=brk, col="#A5D5CC",
xlim = c(0,30), xaxt = "n",
main = "Node degrees 200 nodes PA graph",
xlab = "Degree", ylab = "Number")
axis(side = 1, at = seq(0, 30, by=5))
# Most nodes have a degree of 1, but one node has a degree of 28.
(freqRank <- table(dg))
plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b",
pch = 21, bg = "#A5F5CC",
xlab = "log(Rank)", ylab = "log(frequency)",
main = "200 nodes in a preferential-attachment network")
# Sort-of linear, but many of the higher ranked nodes have a frequency of only
# one. That behaviour smooths out in larger graphs:
#
X <- sample_pa(100000, power = 0.8) # 100,000 nodes
freqRank <- table(degree(X))
plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b",
xlab = "log(Rank)", ylab = "log(frequency)",
pch = 21, bg = "#A5F5CC",
main = "100,000 nodes in a random, scale-free network")
rm(X)
# === Random geometric graph
# Finally, let's simulate a random geometric graph and look at the degree
# distribution. Remember: these graphs have a high probability to have edges
# between nodes that are "close" together - an entirely biological notion.
# We'll randomly place our nodes in a box. Then we'll define the
# probability for two nodes to have an edge to be a function of their distance.
# Here is a function that makes such graphs. iGraph has sample_grg(), which
# connects nodes that are closer than a cutoff, the function I give you below is
# a bit more interesting since it creates edges according to a probability that
# is determined by a generalized logistic function of the distance. This
# sigmoidal function gives a smooth cutoff and creates more "natural" graphs.
# Otherwise, the function is very similar to the random graph function, except
# that we output the "coordinates" of the nodes together with the adjacency
# matrix. Lists FTW.
#
makeRandomGeometricGraph <- function(nam, B = 25, Q = 0.001, t = 0.6) {
  # Build a random geometric graph: nodes are placed uniformly in the unit
  # square, and the probability for a pair of nodes to share an edge falls
  # off with their Euclidean distance via a generalized logistic (sigmoid).
  #
  # nam:     either a character vector of unique names, or a single
  #          number that will be converted into a vector of integers.
  # B, Q, t: shape parameters of the generalized logistic that turns a
  #          distance x into an edge probability:
  #          p <- 1 - 1/((1 + (Q * (exp(-B * (x-t)))))^(1 / nu))
  #
  # Value: a list with the following components:
  #          G$mat : an adjacency matrix
  #          G$nam : labels for the nodes
  #          G$x   : x-coordinates for the nodes
  #          G$y   : y-coordinates for the nodes
  #
  nu <- 1 # generalized-logistic exponent; probably not useful to change

  if (is.numeric(nam) && length(nam) == 1) { # a single number requests
    nam <- as.character(1:nam)               # that many numbered nodes
  }

  G <- list(nam = nam)
  N <- length(G$nam)
  G$mat <- matrix(numeric(N * N), ncol = N)  # all-zero adjacency matrix
  rownames(G$mat) <- G$nam
  colnames(G$mat) <- G$nam

  # place the nodes at random positions in the unit square
  G$x <- runif(N)
  G$y <- runif(N)

  for (i in 1:(N-1)) {          # visit every pair (i, j) once: i < j
    for (j in (i+1):N) {
      dX <- G$x[i] - G$x[j]
      dY <- G$y[i] - G$y[j]
      d  <- sqrt(dX^2 + dY^2)   # Euclidean distance (Pythagoras)
      # distance-dependent edge probability
      pEdge <- 1 - 1/((1 + (Q * (exp(-B * (d-t)))))^(1 / nu))
      if (runif(1) < pEdge) {
        G$mat[i, j] <- 1        # symmetric entries: the graph is undirected
        G$mat[j, i] <- 1
      }
    }
  }
  return(G)
}
# Getting the parameters of a generalized logistic right takes a bit of
# experimenting. If you are interested, you can try a few variations. Or you can
# look up the function at
# https://en.wikipedia.org/wiki/Generalised_logistic_function
# This function computes generalized logistics ...
# genLog <- function(x, B = 25, Q = 0.001, t = 0.5) {
# # generalized logistic (sigmoid)
# nu <- 1
# return(1 - 1/((1 + (Q * (exp(-B * (x-t)))))^(1 / nu)))
# }
#
# ... and this code plots p-values over the distances we could encouter between
# our nodes: from 0 to sqrt(2) i.e. the diagonal of the unit square in which we
# will place our nodes.
# x <- seq(0, sqrt(2), length.out = 50)
# plot(x, genLog(x), type="l", col="#AA0000", ylim = c(0, 1),
# xlab = "d", ylab = "p(edge)")
# 200 node random geometric graph
set.seed(112358)
GRG <- makeRandomGeometricGraph(200, t=0.4)
iGRG <- graph_from_adjacency_matrix(GRG$mat)
iGRGxy <- cbind(GRG$x, GRG$y) # use our node coordinates for layout
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(iGRG,
layout = iGRGxy,
rescale = FALSE,
xlim = c(min(iGRGxy[,1]), max(iGRGxy[,1])) * 1.1,
ylim = c(min(iGRGxy[,2]), max(iGRGxy[,2])) * 1.1,
vertex.color=heat.colors(max(degree(iGRG)+1))[degree(iGRG)+1],
vertex.size = 0.1 + (0.1 * degree(iGRG)),
vertex.label = "",
edge.arrow.size = 0)
par(oPar)
# degree distribution:
(dg <- degree(iGRG)/2)
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
hist(dg, breaks=brk, col="#FCD6E2",
xlim = c(0, 25), xaxt = "n",
main = "Node degrees: 200 nodes RG graph",
xlab = "Degree", ylab = "Number")
axis(side = 1, at = c(0, min(dg):max(dg)))
# You'll find that this is kind of in-between the random, and the scale-free
# graph. We do have hubs, but they are not as extreme as in the scale-free case;
# and we have no singletons, in contrast to the random graph.
(freqRank <- table(dg))
plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b",
pch = 21, bg = "#FCD6E2",
xlab = "log(Rank)", ylab = "log(frequency)",
main = "200 nodes in a random geometric network")
# ====================================================================
# PART THREE: A CLOSER LOOK AT THE igraph PACKAGE
# ====================================================================
# == BASICS ==========================================================
# The basic object of the igraph package is a graph object. Let's explore the
# first graph some more, the one we built with our random gene names:
summary(iG)
# This output means: this is an IGRAPH graph, with D = directed edges and N =
# named nodes, that has 20 nodes and 40 edges. For details, see
?print.igraph
mode(iG)
class(iG)
# This means an igraph graph object is a special list object; it is opaque in
# the sense that a user is never expected to modify its components directly, but
# through a variety of helper functions which the package provides. There are
# many ways to construct graphs - from adjacency matrices, as we have just done,
# from edge lists, or by producing random graphs according to a variety of
# recipes, called _games_ in this package.
# Two basic functions retrieve nodes "Vertices", and "Edges":
V(iG)
E(iG)
# As with many R objects, loading the package provides special functions that
# can be accessed via the same name as the basic R functions, for example:
print(iG)
plot(iG)
# ... where plot() allows the usual flexibility of fine-tuning the plot. We
# first layout the node coordinates with the Fruchtermann-Reingold algorithm - a
# force-directed layout that applies an attractive potential along edges (which
# pulls nodes together) and a repulsive potential to nodes (so they don't
# overlap). Note the use of the degree() function to color and scale nodes and
# labels by degree and the use of the V() function to retrieve the vertex names.
# See ?plot.igraph for details."
iGxy <- layout_with_fr(iG) # calculate layout coordinates
# Plot with some customizing parameters
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(iG,
layout = iGxy,
vertex.color=heat.colors(max(degree(iG)+1))[degree(iG)+1],
vertex.size = 9 + (2 * degree(iG)),
vertex.label.cex = 0.5 + (0.05 * degree(iG)),
edge.arrow.size = 0,
edge.width = 2,
vertex.label = toupper(V(iG)$name))
par(oPar)
# == Components
# The igraph function components() tells us whether there are components of the
# graph in which there is no path to other components.
components(iG)
# In the _membership_ vector, nodes are annotated with the index of the component
# they are part of. Sui7 is the only node of component 2, Cyj1 is in the third
# component etc. This is perhaps more clear if we sort by component index
sort(components(iG)$membership)
# Retrieving e.g. the members of the first component from the list can be done by subsetting:
components(iG)$membership == 1 # logical ..
components(iG)$membership[components(iG)$membership == 1]
names(components(iG)$membership)[components(iG)$membership == 1]
# == RANDOM GRAPHS AND GRAPH METRICS =================================
# Let's explore some of the more interesting, topological graph measures. We
# start by building a somewhat bigger graph. We aren't quite sure whether
# biological graphs are small-world, or random-geometric, or
# preferential-attachment ... but igraph has ways to simulate the basic ones
# (and we could easily simulate our own). Look at the following help pages:
?sample_gnm # see also sample_gnp for the Erdös-Rényi models
?sample_smallworld # for the Watts & Strogatz model
?sample_pa # for the Barabasi-Albert model
# But note that there are many more sample_ functions. Check out the docs!
# Let's look at betweenness measures for our first graph: here: the nodes again
# colored by degree. Degree centrality states: nodes of higher degree are
# considered to be more central. And that's also the way the force-directed
# layout draws them, obviously.
set.seed(112358)
iGxy <- layout_with_fr(iG) # calculate layout coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(iG,
layout = iGxy,
rescale = FALSE,
xlim = c(min(iGxy[,1]), max(iGxy[,1])) * 1.1,
ylim = c(min(iGxy[,2]), max(iGxy[,2])) * 1.1,
vertex.color=heat.colors(max(degree(iG)+1))[degree(iG)+1],
vertex.size = 20 + (10 * degree(iG)),
vertex.label = Nnames,
edge.arrow.size = 0)
par(oPar)
# == Diameter
diameter(iG) # The diameter of a graph is its maximum length shortest path.
# let's plot this path: here are the nodes ...
get_diameter(iG)
# ... and we can get the x, y coordinates from iGxy by subsetting with the node
# names. The we draw the diameter-path with a transparent, thick pink line:
lines(iGxy[get_diameter(iG),], lwd=10, col="#ff63a788")
# == Centralization scores
?centralize
# replot our graph, and color by log_betweenness:
bC <- centr_betw(iG) # calculate betweenness centrality
nodeBetw <- bC$res
nodeBetw <- round(log(nodeBetw +1)) + 1
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(iG,
layout = iGxy,
rescale = FALSE,
xlim = c(min(iGxy[,1]), max(iGxy[,1])) * 1.1,
ylim = c(min(iGxy[,2]), max(iGxy[,2])) * 1.1,
vertex.color=heat.colors(max(nodeBetw))[nodeBetw],
vertex.size = 20 + (10 * degree(iG)),
vertex.label = Nnames,
edge.arrow.size = 0)
par(oPar)
# Note that the betweenness - the number of shortest paths that pass through a
# node, is in general higher for high-degree nodes - but not always: Eqr2 has
# higher betweenness than Itv7: this measure really depends on the detailed
# local topology of the graph."
# Can you use centr_eigen() and centr_degree() to calculate the respective
# values? That's something I would expect you to be able to do.
#
# Lets plot betweenness centrality for our random geometric graph:
bCiGRG <- centr_betw(iGRG) # calculate betweenness centrality
nodeBetw <- bCiGRG$res
nodeBetw <- round((log(nodeBetw +1))^2.5) + 1
# colours and size proportional to betweenness
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(iGRG,
layout = iGRGxy,
rescale = FALSE,
xlim = c(min(iGRGxy[,1]), max(iGRGxy[,1])),
ylim = c(min(iGRGxy[,2]), max(iGRGxy[,2])),
vertex.color=heat.colors(max(nodeBetw))[nodeBetw],
vertex.size = 0.1 + (0.03 * nodeBetw),
vertex.label = "",
edge.arrow.size = 0)
par(oPar)
diameter(iGRG)
lines(iGRGxy[get_diameter(iGRG),], lwd=10, col="#ff335533")
# == CLUSTERING ======================================================
# Clustering finds "communities" in graphs - and depending what the edges
# represent, these could be complexes, pathways, biological systems or similar.
# There are many graph-clustering algorithms. One approach with many attractive
# properties is the Map Equation, developed by Martin Rosvall. See:
# http://www.ncbi.nlm.nih.gov/pubmed/18216267 and http://www.mapequation.org
iGRGclusters <- cluster_infomap(iGRG)
modularity(iGRGclusters) # ... measures how separated the different membership
# types are from each other
membership(iGRGclusters) # which nodes are in what cluster?
table(membership(iGRGclusters)) # how large are the clusters?
# The largest cluster has 48 members, the second largest has 25, etc.
# Lets plot our graph again, coloring the nodes of the first five communities by
# their cluster membership:
# first, make a vector with as many grey colors as we have communities ...
commColors <- rep("#f1eef6", max(membership(iGRGclusters)))
# ... then overwrite the first five with "real colors" - something like rust,
# lilac, pink, and mauve or so.
commColors[1:5] <- c("#980043", "#dd1c77", "#df65b0", "#c994c7", "#d4b9da")
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(iGRG,
layout = iGRGxy,
rescale = FALSE,
xlim = c(min(iGRGxy[,1]), max(iGRGxy[,1])),
ylim = c(min(iGRGxy[,2]), max(iGRGxy[,2])),
vertex.color=commColors[membership(iGRGclusters)],
vertex.size = 0.1 + (0.1 * degree(iGRG)),
vertex.label = "",
edge.arrow.size = 0)
par(oPar)
# = 1 Tasks
# [END]

307
FND-STA-Significance.R Normal file
View File

@ -0,0 +1,307 @@
# FND-STA-Significance.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Significance unit.
#
# Version: 1.0
#
# Date: 2017 09 05
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.0 First contents
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
#
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------------
#TOC> 1 Significance and p-value 46
#TOC> 1.1 Significance levels 57
#TOC> 1.2 probability and p-value 74
#TOC> 1.2.1 p-value illustrated 100
#TOC> 2 One- or two-sided 152
#TOC> 3 Significance by integration 192
#TOC> 4 Significance by simulation or permutation 198
#TOC> 5 Final tasks 298
#TOC>
#TOC> ==========================================================================
#TOC>
#TOC>
# = 1 Significance and p-value ============================================
# The idea of the probability of an event has a precise mathematical
# interpretation, but how is it useful to know the probability? Usually we are
# interested in whether we should accept or reject a hypothesis based on the
# observations we have. A rational way to do this is to say: if the probability
# of observing the data is very small under the null-hypothesis, then we will
# assume the observation is due to something other than the null-hypothesis. But
# what do we mean by the "probability of our observation"? And what is "very
# small"?
# == 1.1 Significance levels ===============================================
# A "very small" probability is purely a matter of convention, a cultural
# convention. In the biomedical field we usually call probabilities of less then
# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
# observations with a probability of less than 0.05 "significant" and if we want
# to highlight this in text or in a graph, we often mark them with an asterisk
# (*). Also we often call observations with a probability of less than 0.01
# "highly significant" and mark them with two asterisks (**). But there is no
# special significance in these numbers, the cutoff point for significance could
# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
# British statistician Ronald Fisher happened to propose for this purpose in
# 1925. Incidentally, Fisher later recommended to use different cutoffs for
# different purposes (cf.
# https://en.wikipedia.org/wiki/Statistical_significance).
# == 1.2 probability and p-value ===========================================
# But what do we even mean by the probability of an observation?
# Assume I am drawing samples from a normal distribution with a mean of 0 and a
# standard deviation of 1. The sample I get is ...
set.seed(sqrt(5))
x <- rnorm(1)
print(x, digits = 22)
# [1] -0.8969145466249813791748
# So what's the probability of that number? Obviously, the probability of
# getting exactly this number is very, very, very small. But also obviously,
# this does not mean that observing this number is in any way significant - we
# always observe some number. That's not what we mean in this case. There are
# several implicit assumptions when we speak of the probability of an
# observation:
# 1: the observation can be compared to a probability distribution;
# 2: that distribution can be integrated between any specific value
# and its upper and lower bounds (or +- infinity).
# Then what we really mean by the probability of an observation in the context of that distribution is: the
# probability of observing that value, or a value more extreme than the one we have. We call this the p-value. Note that we are not talking about an individual number anymore, we are talking about the area under the curve between our observation and the upper (or lower) bound of the curve, as a fraction of the whole.
# === 1.2.1 p-value illustrated
# Let's illustrate. First we draw a million random values from our
# standard, normal distribution:
r <- rnorm(1000000)
# Let's see what the distribution looks like:
(h <- hist(r))
# The histogram details are now available in the list h - e.g. h$counts
# Where is the value we have drawn previously?
abline(v = x, col = "#EE0000")
# How many values are smaller?
sum(r < x)
# Let's color the bars:
# first, make a vector of red and green colors for the bars with breaks
# smaller and larger then x, white for the bar that contains x ...
hCol <- rep("#EE000044", sum(h$breaks < x) - 1)
hCol <- c(hCol, "#FFFFFFFF")
hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
# ... then plot the histogram, with colored bars ...
hist(r, col = hCol)
# ... add two colored rectangles into the white bar ...
idx <- sum(h$breaks < x)
xMin <- h$breaks[idx]
xMax <- h$breaks[idx + 1]
y <- h$counts[idx]
rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
# ... and a red line for our observation.
abline(v = x, col = "#EE0000", lwd = 2)
# The p-value of our observation is the area colored red as a fraction of the
# whole histogram (red + green).
# Task:
# Explain how the expression sum(r < x) works to give us a count of values
# with the property we are looking for.
# Task:
# Write an expression to estimate the probability that a value
# drawn from the vector r is less-or-equal to x. The result you get
# will depend on the exact values that went into the vector r but it should
# be close to 0.185 That expression is the p-value associated with x.
# = 2 One- or two-sided ===================================================
# The shape of our histogram confirms that the rnorm() function has returned
# values that appear distributed according to a normal distribution. In a normal
# distribution, readily available tables tell us that 5% of the values (i.e. our
# significance level) lie 1.96 (or approximately 2) standard deviations away
# from the mean. Is this the case here? How many values in our vector r are
# larger than 1.96?
sum(r > 1.96)
# [1] 24589
# Wait - that's about 2.5% , not 5% as expected. Why?
# The answer is: we have to be careful with two-sided distributions. 2 standard
# deviations away from the mean means either larger or smaller than 1.96 . This
# can give rise to errors. If we are simply are interested in outliers, no
# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
# But if we are specifically interested in, say, larger values, because a
# smaller value is not meaningful, then the significance cutoff, expressed as
# standard deviations, is relaxed. We can use the quantile function to see what
# the cutoff values are:
quantile(r)
quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
# close to ± 1.96, as expected
quantile(r, probs = 0.95) # for the single 5% boundary
# close to 1.64 . Check counts to confirm:
sum(r > quantile(r, probs = 0.95))
# [1] 50000
# which is 5%, as expected.
# To summarize: when we evaluate the significance of an event, we divide a
# probability distribution into two parts at the point where the event was
# observed. We then ask whether the integral over the more extreme part is less
# or more than 5% of the whole. If it is less, we deem the event to be
# significant.
#
# = 3 Significance by integration =========================================
# If the underlying probability distribution can be analytically or numerically
# integrated, the significance of an observation can be directly computed.
# = 4 Significance by simulation or permutation ===========================
# But whether the integration is correct, or relies on assumptions that may not
# be warranted for biological data, can be a highly technical question.
# Fortunately, we can often simply run a simulation, a random resampling, or a
# permutation and then count the number of outcomes, just as we did with our
# rnorm() samples. Here is an example. Assume you have a protein sequence and
# you speculate that positively charged residues are close to negatively charged
# residues to balance charge locally. A statistic that would capture this is the
# mean minimum distance between all D,E residues and the closest R,K,H
# residue. Let's compute this for the sequence of yeast Mbp1.
MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
"ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
"SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
"KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
"QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
"PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
"FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
"IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
"SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
"ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
"VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
"IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
"QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
"IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
# first we split this string into individual characters:
v <- unlist(strsplit(MBP1, ""))
# and find the positions of our charged residues
ED <- grep("[ED]", v)
RKH <- grep("[RKH]", v)
sep <- numeric(length(ED)) # this vector will hold the distances
for (i in seq_along(ED)) {
sep[i] <- min(abs(RKH - ED[i]))
}
# Task: read and explain this bit of code
# Now that sep is computed, what does it look like?
table(sep) # these are the minimum distances
# 24 of D,E residues are adjacent to R,K,H;
# the longest separation is 28 residues.
# What is the mean separation?
mean(sep)
# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
# to solve this analytically. But by permutation it's soooo easy.
# First, we combine what we have done above into a function:
chSep <- function(v) {
  # Compute the mean minimum separation of oppositely charged residues:
  # for every acidic residue (D, E), find the sequence distance to the
  # closest basic residue (R, K, H), then average those minima.
  # Parameter: v (char)    a vector of amino acids in the one-letter code
  # Value:     (numeric)   mean minimum separation
  acidic <- grep("[EDed]", v)    # positions of negatively charged residues
  basic  <- grep("[RKHrkh]", v)  # positions of positively charged residues
  minSep <- vapply(acidic,
                   function(pos) min(abs(basic - pos)),
                   numeric(1))
  return(mean(minSep))
}
# Execute the function to define it.
# Confirm that the function gives the same result as the number we
# calculated above:
chSep(v)
# Now we can produce a random permutation of v, and recalculate
set.seed(pi)
w <- sample(v, length(v)) # This shuffles the vector v. Memorize this
# code paradigm. It is very useful.
chSep(w)
# 3.273 ... that's actually less than what we had before.
# Let's do this 10000 times and record the results (takes a few seconds):
N <- 10000
chs <- numeric(N)
for (i in 1:N) {
chs[i] <- chSep(sample(v, length(v))) # charge
}
hist(chs)
abline(v = chSep(v), col = "#EE0000")
# Contrary to our expectations, the actual observed mean minimum charge
# separation seems to be larger than what we observe in randomly permuted
# sequences. But is this significant? Your task to find out.
# = 5 Final tasks =========================================================
# From chs, compute the probability of a mean minimum charge separation to be
# larger or equal to the value observed for the yeast MBP1 sequence. Note
# the result in your journal. Is it significant? Also note the result of
# the following expression for validation:
writeBin(sum(chs), raw(8))
# [END]

51
RPR-Biostrings.R Normal file
View File

@ -0,0 +1,51 @@
# RPR-Biostrings.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Biostrings unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 The Biostrings package
# First, we install and load the Biostrings package.
# Install-on-demand pattern: require() returns FALSE (rather than erroring
# like library()) when the package is missing, which triggers installation.
if (!require(Biostrings, quietly=TRUE)) {
  source("https://bioconductor.org/biocLite.R")
  biocLite("Biostrings")
  library(Biostrings)
}
# NOTE(review): biocLite() was the Bioconductor installer current in 2017;
# current Bioconductor uses BiocManager::install() instead.
# This is a large collection of tools ...
help(package = "Biostrings")
# = 1.1 <<<Subsection>>>
# = 1 Tasks
# [END]

84
RPR-Introduction.R Normal file
View File

@ -0,0 +1,84 @@
# RPR-Introduction.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Introduction unit
#
# Version: 0.1
#
# Date: 2017 08 25
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# === TASK: Local script
#
# - Open the file myScript.R
# - Update the header of the script to emulate the header of the file you are
# reading right now. Don't bother giving the myScript.R file version numbers
# though.
# - Create a section header with a date.
# - Enter an R-expression that will produce the first 21 powers of 2 (starting
# from 0). Not a loop - a single expression. The first number you get must
# be 1. The last number you get must be 1024.
# - Save the file.
#
#
# ==============================================================================
" Introduction:
...
"
# ==============================================================================
# PART ONE: REVIEW
# ==============================================================================
# == SECTION ===================================================================
# == Subsection
# Continue ...
# ==============================================================================
# APPENDIX: OUTLOOK
# ==============================================================================
"There are many more functions for ... that this tutorial did not cover. You should know about the following. Look up the function and write a short bit of example code that uses it:"
?subset
?sweep
?with # ... and within()
"Then you should know about the following packages. Open the vignette and browse through it. You should be able to come up with at least one use-case where the package functions would be useful:
https://cran.r-project.org/web/packages/magrittr/
"
# [END]

67
RPR-RegEx.R Normal file
View File

@ -0,0 +1,67 @@
# RPR-RegEx.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-RegEx unit
#
# Version: 0.1
#
# Date: 2017 08 25
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 0.1 First code
#
# TODO:
#
#
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
#
# DO NOT SIMPLY source() THESE FILES!
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
#
# ==============================================================================
# ==============================================================================
# PART ONE: REGEX EXAMPLE
# ==============================================================================
# The Mbp1 sequence as copied from the NCBI Website
mySeq <- "
1 msnqiysary sgvdvyefih stgsimkrkk ddwvnathil kaanfakakr trilekevlk
61 ethekvqggf gkyqgtwvpl niakqlaekf svydqlkplf dftqtdgsas pppapkhhha
121 skvdrkkair sastsaimet krnnkkaeen qfqsskilgn ptaaprkrgr pvgstrgsrr
181 klgvnlqrsq sdmgfprpai pnssisttql psirstmgpq sptlgileee rhdsrqqqpq
241 qnnsaqfkei dledglssdv epsqqlqqvf nqntgfvpqq qssliqtqqt esmatsvsss
301 pslptspgdf adsnpfeerf pgggtspiis miprypvtsr pqtsdindkv nkylsklvdy
361 fisnemksnk slpqvllhpp phsapyidap idpelhtafh wacsmgnlpi aealyeagts
421 irstnsqgqt plmrsslfhn sytrrtfpri fqllhetvfd idsqsqtvih hivkrksttp
481 savyyldvvl skikdfspqy rielllntqd kngdtalhia skngdvvffn tlvkmgaltt
541 isnkegltan eimnqqyeqm miqngtnqhv nssntdlnih vntnnietkn dvnsmvimsp
601 vspsdyityp sqiatnisrn ipnvvnsmkq masiyndlhe qhdneikslq ktlksisktk
661 iqvslktlev lkesskdeng eaqtnddfei lsrlqeqntk klrkrliryk rlikqkleyr
721 qtvllnklie detqattnnt vekdnntler lelaqeltml qlqrknklss lvkkfednak
781 ihkyrriire gtemnieevd ssldvilqtl iannnknkga eqiitisnan sha
//
"
mySeq # "\n" means: line-break
# The regular expression "[^a-zA-Z]" matches every character that is NOT a
# letter; gsub() replaces all matches, stripping the numbering, spaces,
# line-breaks and the trailing "//" so only the sequence letters remain.
mySeq <- gsub("[^a-zA-Z]", "", mySeq) # replace all non-letters with ""
mySeq
# Now to change the sequence to upper-case. R has toupper()
# and tolower().
toupper(mySeq)
# CONTINUE ...
# [END]

717
RPR-SX-PDB.R Normal file
View File

@ -0,0 +1,717 @@
# RPR-SX-PDB.R
#
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-SX-PDB unit.
#
# Version: 0.1
#
# Date: 2017 08 28
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.1 First code copied from 2016 material.
#
# TODO:
#
#
# == DO NOT SIMPLY source() THIS FILE! =======================================
# If there are portions you don't understand, use R's help system, Google for an
# answer, or ask your instructor. Don't continue if you don't understand what's
# going on. That's not how it works ...
# ==============================================================================
# = 1 ___Section___
# In this example of protein structure interpretation, we ...
# - load the library "bio3d" which supports work with
#   protein structure files,
# - explore some elementary functions of the library
# - assemble a script to see whether H-bond lengths systematically differ
#   between alpha-helical and beta-sheet structures.
# Install-on-demand pattern: require() returns FALSE (rather than erroring
# like library()) when the package is missing.
if(!require(bio3d)) {
  install.packages("bio3d", dependencies=TRUE)
  library(bio3d)
}
lbio3d() # ... lists the newly installed functions,
# they all have help files associated.
# More information is available in the so-called
# "vignettes" that are distributed with most R packages:
vignette("bio3d_vignettes")  # NOTE(review): the vignette name may differ by
                             # bio3d version - see vignette(package = "bio3d")
# bio3d can load molecules directly from the PDB servers, you don't _have_ to
# store them locally, but you could.
#
# But before you _load_ a file, have a look what such a file contains.
# I have packaged the 1BM8 file into the project:
file.show("./assets/1BM8.pdb")
# Have a look at the header section, the atom records, the coordinate records
# etc. Answer the following questions:
#
# What is the resolution of the structure?
# Is the first residue in the SEQRES section also the first residue
# with an ATOM record?
# How many water molecules does the structure contain?
# Can you explain REMARK 525 regarding HOH 459 and HOH 473?
# Are all atoms of the N-terminal residue present?
# Are all atoms of the C-terminal residue present?
apses <- read.pdb("1bm8") # load a molecule directly from PDB
# check what we have:
apses
# what is this actually?
str(apses)
# bio3d's pdb objects are simple lists. Great! You know lists!
# You see that there is a table called atom in which the elements are vectors
# of the same length - namely the number of atoms in the structure file. And
# there is a matrix of (x, y, z) triplets called xyz. And there is a vector
# that holds sequence, and two tables called helix and sheet that look a lot
# like our feature annotation tables - in fact many of the principles to store
# this structure data are similar to how we constructed our protein database.
# Let's pull out a few values to confirm how selection and subsetting works:
# selection by atom ...
i <- 5
apses$atom[i,]
apses$atom[i, c("x", "y", "z")] # here we are selecting with column names!
apses$xyz[c(i*3-2, i*3-1, i*3)] # ... and here with positions into the flat
                                # xyz vector: three coordinates per atom
# all atoms of a residue ...
i <- "48" # string! "resno" is matched as a label here, not as an index
apses$atom[apses$atom[,"resno"] == i, ]
# sequence of the first ten residues
apses$seqres[1:10] # the As here identify the chain
# Can we convert this to one letter code?
aa321(apses$seqres[1:10])
# Lets get the implicit sequence:
aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code?
# Do explicit and implicit sequence have the same length?
length(apses$seqres)
length(apses$atom$resid[apses$calpha])
# Are the sequences the same?
sum(apses$seqres == apses$atom$resid[apses$calpha])
# get a list of all CA atoms of arginine residues
sel <- apses$atom$resid == "ARG" & apses$atom$elety == "CA"
apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]
# The introduction to bio3d tutorial at
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
# has the following example:
plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
# Good for now. Let's do some real work.
# ==============================================================================
# PART TWO: A Ramachandran plot
# ==============================================================================
# Calculate a Ramachandran plot for the structure: the distribution of the
# backbone phi/psi dihedral angles.
tor <- torsion.pdb(apses)
plot(tor$phi, tor$psi)
# As you can see, there are a number of points in the upper-right
# quadrant of the plot. This combination of phi-psi angles defines
# the conformation of a left-handed alpha helix and is generally
# only observed for glycine residues. Let's replot the data, but
# color the points for glycine residues differently. First, we
# get a vector of glycine residue indices in the structure:
sSeq <- pdbseq(apses)
# Explore the result object and extract the indices of GLY residues.
sSeq == "G"
which(sSeq == "G")
as.numeric(which(sSeq == "G"))
iGly <- as.numeric(which(sSeq == "G"))
# Now plot all non-gly residues.
# Remember: negative indices exclude items from a vector
plot(tor$phi[-iGly], tor$psi[-iGly], xlim=c(-180,180), ylim=c(-180,180))
# Now plot GLY only, but with green dots:
points(tor$phi[iGly], tor$psi[iGly], pch=21, cex=0.9, bg="#00CC00")
# As you see, four residues in the upper-right quadrant are
# not glycine. But what residues are these? Is there an
# error in our script? Let's get their coordinate records:
iOutliers <- which(tor$phi > 30 & tor$phi < 90 &
                   tor$psi > 0 & tor$psi < 90)
CA <- apses$atom[apses$calpha, c("eleno", "elety", "resid", "chain", "resno")]
dat <- cbind(CA[iOutliers, ], phi = tor$phi[iOutliers], psi = tor$psi[iOutliers])
dat
# remove the glycine from our table ...
dat <- dat[dat$resid != "GLY", ]
dat
# let's add the residue numbers to the plot with the text function:
for (i in 1:nrow(dat)) {
  points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000")
  text(dat$phi[i],
       dat$psi[i],
       labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]),
       pos = 4,
       offset = 0.4,
       cex = 0.7)
}
# You can check the residues in Chimera. Is there anything unusual?
# Are there any cis-peptide bonds in the structure?
tor$omega
#... gives us a quick answer. But wait - what values do we
# expect? And why are the values so different?
# Consider this plot: what am I doing here and why?
# (trans-peptide omega angles cluster near +180 AND -180 degrees; mapping
# the negative values to the 180..360 range collects them into one peak)
x <- tor$omega[tor$omega > 0]
x <- c(x, 360 + tor$omega[tor$omega < 0])
hist(x, xlim=c(90,270))
abline(v=180, col="red")
# ==============================================================================
# PART THREE: H-bond lengths
# ==============================================================================
# Let's do something a little less trivial and compare
# backbone H-bond lengths between helices and strands.
#
# Secondary structure is defined in the list components ...$helix
# and ...$strand.
# We need to
# - collect all residue indices for alpha helices resp.
#   beta strands;
# - fetch the atom coordinates;
# - calculate all N, O distances using dist.xyz();
# - filter them for distances that indicate H-bonds; and,
# - plot the results.
# Secondary structure can either be obtained from
# definitions contained in many PDB files, or by running
# the DSSP algorithm IF(!) you have it installed on your
# machine. The 1BM8 file contains definitions
apses$helix
apses$sheet
# (1): collect the residue numbers
# between the segment boundaries.
H <- numeric() # This will contain the helix residue numbers
# BUG FIX: the original iterated over 1:length(apses$helix), i.e. over the
# COMPONENTS of the helix list (start, end, chain, type), not over the
# helix segments. Iterate over the segment start vector instead (compare
# getSecondary() below, which does this correctly):
for (i in seq_along(apses$helix$start)) {
  H <- c(H, apses$helix$start[i]:apses$helix$end[i])
}
# Doing the same for the sheet residue numbers requires
# very similar code. Rather than retype the code, it is
# better to write a function that can do both.
getSecondary <- function(sec) {
  # Expand secondary-structure segment boundaries into residue numbers.
  # Parameter: sec  list with parallel components $start and $end, one
  #                 element per segment (e.g. the $helix or $sheet
  #                 component of a bio3d pdb object).
  # Value: vector of all residue numbers covered by the segments,
  #        concatenated in segment order; integer(0) if no segments.
  if (length(sec$start) == 0) {   # guard: 1:length() would mis-iterate here
    return(integer(0))
  }
  # one start:end run per segment, concatenated (no c()-growth in a loop)
  return(unlist(Map(`:`, sec$start, sec$end), use.names = FALSE))
}
# Compare ...
H
getSecondary(apses$helix)
# ... and use for strands
E <- getSecondary(apses$sheet)
# Now here's a problem: these numbers refer to the
# residue numbers as defined in the atom records. They
# can't be used directly to address e.g. the first, second
# third residue etc. since the first residue has the
# residue number 4...
apses$atom[1,]
# Therefore we need to
# 1: convert the numbers to strings;
# 2: subset the atom table for rows contain these strings.
#
# Essentially, we don't treat the "residue numbers" as numbers,
# but as labels. That's fine, as long as they are unique.
# (2): fetch coordinates of N and O atoms
# for residues in alpha- and beta- conformation.
# For one residue, the procedure goes as follows:
res <- H[17] # pick an arbitrary alpha-helix residue to illustrate
res
res <- as.character(res)
res
# all atom rows for this residue
apses$atom[apses$atom[,"resno"] == res, ]
# add condition: row with "N" atom only
apses$atom[apses$atom[,"resno"] == res &
           apses$atom[,"elety"] == "N", ]
# add column selection: "x", "y", "z"
apses$atom[apses$atom[,"resno"] == res &
           apses$atom[,"elety"] == "N",
           c("x", "y", "z")]
# convert to numbers
as.numeric (
  apses$atom[apses$atom[,"resno"] == res &
             apses$atom[,"elety"] == "N",
             c("x", "y", "z")]
)
# Now we need to add this into a matrix as we iterate over the desired residues.
# We need to execute the procedure four times for alpha and beta Ns and Os
# respectively. Rather than duplicate the code four times over, we write a
# function. Never duplicate code, because if you need to make changes it is too
# easy to forget making the change consistently in all copies.
getAtom <- function(PDB, r, AT) {
  # Extract (x, y, z) coordinates of one atom type for a set of residues.
  # Parameters:
  #   PDB  bio3d pdb object (a list with an $atom table)
  #   r    vector of residue numbers; matched as strings against "resno"
  #   AT   atom type string, e.g. "N" or "O"
  # Value: numeric matrix, one row per element of r, columns x, y, z.
  # Note: assumes exactly one matching atom per residue - TODO confirm
  # for structures with altLoc or insertion codes.
  mat <- matrix(NA_real_, nrow = length(r), ncol = 3)  # preallocate:
                                  # no O(n^2) rbind()-growth in the loop
  for (i in seq_along(r)) {
    res <- as.character(r[i])     # residue numbers are used as labels
    mat[i, ] <- as.numeric(
      PDB$atom[PDB$atom[, "resno"] == res &
               PDB$atom[, "elety"] == AT,
               c("x", "y", "z")]
    )
  }
  return(mat)
}
# Now run the functions with the parameters we need...
H.xyz.N <- getAtom(apses, H, "N") # backbone N atoms in helix
H.xyz.O <- getAtom(apses, H, "O") # backbone O atoms in helix
E.xyz.N <- getAtom(apses, E, "N") # backbone N atoms in strand
E.xyz.O <- getAtom(apses, E, "O") # backbone O atoms in strand
# (3): calculate distances between N and O atoms to find H-bonds.
# We spent most of our effort so far just preparing our raw data for analysis.
# Now we can actually start measuring. bio3d provides the function dist.xyz() -
# but lets agree it builds character to code this ourselves.
# Consider the distance of the first (N,O) pair.
H.xyz.N[1,]
H.xyz.O[1,]
a <- H.xyz.N[1,]
b <- H.xyz.O[1,]
dist.xyz(a, b)
# or ... computed by hand:
sqrt( (a[1]-b[1])^2 + (a[2]-b[2])^2 + (a[3]-b[3])^2 )
# ... i.e. pythagoras theorem in 3D.
# Calculating all pairwise distances from a matrix of
# xyz coordinates sounds like a useful function.
PairDist.xyz <- function(xyzA, xyzB) {
  # All pairwise distances between two sets of (x, y, z) coordinates.
  # Parameters: xyzA, xyzB  numeric matrices with columns x, y, z
  # Value: numeric vector of length nrow(xyzA) * nrow(xyzB): for each
  #        row of xyzA, all distances to the rows of xyzB, in row order.
  nA <- nrow(xyzA)
  nB <- nrow(xyzB)
  PD <- numeric(nA * nB)  # preallocate instead of growing with c()
  k <- 0
  for (i in seq_len(nA)) {
    for (j in seq_len(nB)) {
      k <- k + 1
      PD[k] <- dist.xyz(xyzA[i, ], xyzB[j, ])  # bio3d Euclidean distance
    }
  }
  return(PD)
}
# And see what we get:
D <- PairDist.xyz(H.xyz.N, H.xyz.O)
hist(D)
# let's zoom in on the shorter distances, in which we expect
# hydrogen bonds:
hist(D[D < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0))
# There is a large peak at about 2.2Å, and another
# large peak above 3.5Å. But these are not typical hydrogen
# bond distances! Rather these are (N,O) pairs in peptide
# bonds, and within residues. That's not good, it will
# cause all sorts of problems with statistics later.
# We should exclude all distances between N of a residue
# and O of a preceding residue, and all (N,O) pairs in the
# same residue. But for this, we need to store atom type
# and residue information with the coordinates. Our code
# will get a bit bulkier. It's often hard to find a good
# balance between terse generic code, and code that
# handles special cases.
# We could do two things:
# - add a column with residue information to the coordinates
# - add a column with atom type information
# ... but actually we also would need chain information, and
# then we really have almost everything that is contained in the record
# in the first place.
# This suggests a different strategy: let's rewrite our function
# getAtom() to return indices, not coordinates, and use the indices
# to extract coordinates, and THEN we can add all sorts of
# additional constraints.
# Here we set up the function with a default chain argument
getAtomIndex <- function(PDB, V_res, elety, chain="A") {
  # Access a bio3d pdb object and return the row indices of atoms
  # matching the requested residues, atom type, and chain.
  # Parameters:
  #   PDB    bio3d pdb object (a list with an $atom table)
  #   V_res  vector of residue numbers (used as labels: matched as strings)
  #   elety  atom type string, e.g. "N" or "O"
  #   chain  chain ID (default "A")
  # Value: integer vector of row indices into PDB$atom.
  # Nb. bio3d handles insert and alt fields incorrectly: their values
  # should not be NA but " ", i.e. a single blank. Therefore this
  # function does not test for "alt" and "insert".
  hits <- lapply(V_res, function(r) {   # one index vector per residue
    which(PDB$atom[, "resno"] == as.character(r) &
          PDB$atom[, "chain"] == chain &
          PDB$atom[, "elety"] == elety)
  })
  idx <- unlist(hits, use.names = FALSE)
  if (is.null(idx)) {     # empty V_res: return a typed empty vector
    idx <- integer(0)
  }
  return(idx)
}
# test this ...
getAtomIndex(apses, H, "N")
getAtomIndex(apses, H, "O")
# That looks correct: O atoms should be stored three
# rows further down: the sequence of atoms in a PDB
# file is usually N, CA, C, O ... followed by the side
# chain coordinates.
# Now to extract the coordinates and calculate distances.
# Our function needs to take the PDB object and
# two vectors of atom indices as argument, and return
# a vector of pair-distances.
PairDist <- function(PDB, a, b) {
  # Pairwise distances between two sets of atoms given as row indices
  # into the atom table of a bio3d pdb object.
  # Parameters:
  #   PDB   bio3d pdb object
  #   a, b  vectors of row indices into PDB$atom
  # Value: numeric vector of all length(a) * length(b) distances: for
  #        each atom in a, all distances to the atoms in b, in order.
  PD <- numeric(length(a) * length(b))  # preallocate: no c()-growth
  k <- 0
  for (i in seq_along(a)) {
    p <- as.numeric(PDB$atom[a[i], c("x", "y", "z")])  # constant over j
    for (j in seq_along(b)) {
      q <- as.numeric(PDB$atom[b[j], c("x", "y", "z")])
      k <- k + 1
      PD[k] <- dist.xyz(p, q)
    }
  }
  return(PD)
}
# Let's see if this looks correct:
H.N <- getAtomIndex(apses, H, "N")
H.O <- getAtomIndex(apses, H, "O")
X <- PairDist(apses, H.N, H.O)
hist(X[X < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0))
# Now we are back where we started out from, but with
# a different logic of the function that we can easily
# modify to exclude (N_i, O_i-1) distances (peptide bond),
# and (N_i, O_i) distances (within residue).
HB <- function(PDB, a, b) {
  # Putative backbone H-bond distances between donor N atoms (indices a)
  # and acceptor O atoms (indices b) of a bio3d pdb object. Excludes
  # (N_i, O_i-1) pairs (covalent peptide bond) and (N_i, O_i) pairs
  # (within the same residue).
  # Parameters:
  #   PDB   bio3d pdb object
  #   a, b  vectors of row indices into PDB$atom (N resp. O atoms)
  # Value: numeric vector of N-O distances shorter than HBcutoff (Å).
  HBcutoff <- 4.0
  PD <- c()   # grown as needed: the number of H-bonds is not known upfront
  for (i in seq_along(a)) {
    p <- as.numeric(PDB$atom[a[i], c("x", "y", "z")])
    res_i <- as.numeric(PDB$atom[a[i], "resno"])
    for (j in seq_along(b)) {
      q <- as.numeric(PDB$atom[b[j], c("x", "y", "z")])
      # BUG FIX: the original read res_j from a[j] - the DONOR index
      # vector - which is only accidentally correct when a and b are
      # parallel, and fails (NA subscript) when length(b) > length(a).
      res_j <- as.numeric(PDB$atom[b[j], "resno"])
      if (res_i != res_j + 1 &&   # not the peptide bond partner ...
          res_i != res_j) {       # ... and not within the same residue
        d <- dist.xyz(p, q)
        if (d < HBcutoff) {
          PD <- c(PD, d)
        }
      }
    }
  }
  return(PD)
}
# test this:
X <- HB(apses, H.N, H.O)
hist(X)
# ... and this looks much more like the distribution we are
# seeking.
# Why did we go along this detour for coding the
# function? The point is that there are usually
# several ways to use or define datastructures and
# functions. Which one is the best way may not be
# obvious until you understand the problem better.
# At first, we wrote a very generic function that
# extracts atom coordinates to be able to compute
# with them. This is simple and elegant. But we
# recognized limitations in that we could not
# make more sophisticated selections that we needed
# to reflect our biological idea of hydrogen
# bonds. Thus we changed our datastructure
# and functions to accommodate our new requirements
# better. You have to be flexible and able to look
# at a task from different angles to succeed.
# Finally we can calculate alpha- and beta- structure
# bonds and compare them. In this section we'll explore
# different options for histogram plots.
H.N <- getAtomIndex(apses, H, "N")
H.O <- getAtomIndex(apses, H, "O")
dH <- HB(apses, H.N, H.O)
E.N <- getAtomIndex(apses, E, "N")
E.O <- getAtomIndex(apses, E, "O")
dE <- HB(apses, E.N, E.O)
# The plain histogram functions without parameters
# give us white stacks.
hist(dH)
# and ...
hist(dE)
# We can see that the histograms look different
# but that is better visualized by showing two plots
# in the same window. We use the par() function, for
# more flexible layout, look up the layout() function.
?par
?layout
opar <- par(no.readonly=TRUE) # store current state
par(mfrow=c(2,1)) # set graphics parameters: 2 rows, one column
# plot two histograms
hist(dH)
hist(dE)
# add color:
hist(dH, col="#DD0055")
hist(dE, col="#00AA70")
# For better comparison, plot both in the
# same window:
hist(dH, col="#DD0055")
hist(dE, col="#00AA70", add=TRUE)
# ... oops, we didn't reset the graphics parameters.
# You can either close the window, a new window
# will open with default parameters, or ...
par(opar) # ... reset the graphics parameters
hist(dH, col="#DD0055")
hist(dE, col="#00AA70", add=TRUE)
# We see that the leftmost column of the sheet bonds
# overlaps the helix bonds. Not good. But we
# can make the colors transparent! We just need to
# add a fourth set of two hexadecimal-numbers to
# the #RRGGBB triplet. Lets use 2/3 transparent,
# in hexadecimal, 1/3 of 256 is x55 - i.e. an
# RGB triplet specified as #RRGGBB55 is only 33%
# opaque:
hist(dH, col="#DD005555")
hist(dE, col="#00AA7055", add=TRUE)
# To finalize the plots, let's do two more things:
# Explicitly define the breaks, to make sure they
# match up - otherwise they would not need to...
# see for example:
hist(dH, col="#DD005555")
hist(dE[dE < 3], col="#00AA7055", add=TRUE)
# Breaks are a parameter in hist() that can
# either be a scalar, to define how many columns
# you want, or a vector, that defines the actual
# breakpoints.
brk=seq(2.4, 4.0, 0.1)
hist(dH, col="#DD005555", breaks=brk)
hist(dE, col="#00AA7055", breaks=brk, add=TRUE)
# The last thing to do is to think about rescaling the plot.
# You notice that the y-axis is scaled in absolute frequency.
# That gives us some impression of the relative frequency,
# but it is of course skewed by observing relatively more
# or less of one type of secondary structure in a protein.
# As part of the hist() function we can rescale the values so
# that the sum over all is one: set the parameter freq=FALSE.
hist(dH, col="#DD005555", breaks=brk, freq=FALSE)
hist(dE, col="#00AA7055", breaks=brk, freq=FALSE, add=TRUE)
# Adding labels and legend ...
hH <- hist(dH,
           freq=FALSE,
           breaks=brk,
           col="#DD005550",
           xlab="(N,O) distance (Å)",
           ylab="Density",
           ylim=c(0,4),
           main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
           freq=FALSE,
           breaks=brk,
           col="#00AA7060",
           add=TRUE)
legend("topright",
       c(sprintf("alpha (N = %3d)", sum(hH$counts)),
         sprintf("beta (N = %3d)", sum(hE$counts))),
       fill = c("#DD005550", "#00AA7060"), bty = 'n',
       border = NA)
# ===========================================================
# With all the functions we have defined,
# it is easy to try this with a larger protein.
# 3ugj for example is VERY large. The calculation will take a few
# minutes:
pdb <- read.pdb("3ugj")
H <- getSecondary(pdb$helix)
E <- getSecondary(pdb$sheet)
H.N <- getAtomIndex(pdb, H, "N")
H.O <- getAtomIndex(pdb, H, "O")
dH <- HB(pdb, H.N, H.O)
E.N <- getAtomIndex(pdb, E, "N")
E.O <- getAtomIndex(pdb, E, "O")
dE <- HB(pdb, E.N, E.O)
brk=seq(2.4, 4.0, 0.1)
hH <- hist(dH,
           freq=FALSE,
           breaks=brk,
           col="#DD005550",
           xlab="(N,O) distance (Å)",
           ylab="Density",
           ylim=c(0,4),
           main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
           freq=FALSE,
           breaks=brk,
           col="#00AA7060",
           add=TRUE)
legend('topright',
       c(paste("alpha (N = ", sum(hH$counts), ")"),
         paste("beta (N = ", sum(hE$counts), ")")),
       fill = c("#DD005550", "#00AA7060"), bty = 'n',
       border = NA,
       inset = 0.1)
# It looks more and more that the distribution is
# indeed different. Our sample is large, but derives
# from a single protein.
# To do database scale statistics, we should look
# at many more proteins. To give you a sense of how,
# let's do this for just ten proteins, taken from
# the architecture level of the CATH database for
# mixed alpha-beta proteins (see:
# http://www.cathdb.info/browse/browse_hierarchy_tree):
# NOTE(review): the list below actually contains 13 entries, not ten.
# Each row holds a PDB ID and the chain to analyze; rbind() of the
# named vector produces a two-column character matrix.
PDBarchitectures <- c("3A4R", "A")
names(PDBarchitectures) <- c("ID", "chain")
PDBarchitectures <- rbind(PDBarchitectures, c("1EWF","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2VXN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1I3K","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1C0P","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("3QVP","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1J5U","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2IMH","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("3NVS","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1UD9","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1XKN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1OZN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2DKJ","A"))
# Collect H-bond distances over all structures: fetch each PDB file,
# expand its secondary structure definitions, and append the helix
# resp. sheet distances.
dH <- c()
dE <- c()
for (i in 1:nrow(PDBarchitectures)) {
  pdb <- read.pdb(PDBarchitectures[i,1])
  chain <- PDBarchitectures[i,2]
  H <- getSecondary(pdb$helix)
  H.N <- getAtomIndex(pdb, H, "N", chain)
  H.O <- getAtomIndex(pdb, H, "O", chain)
  dH <- c(dH, HB(pdb, H.N, H.O))
  E <- getSecondary(pdb$sheet)
  E.N <- getAtomIndex(pdb, E, "N", chain)
  E.O <- getAtomIndex(pdb, E, "O", chain)
  dE <- c(dE, HB(pdb, E.N, E.O))
}
brk=seq(2.0, 4.0, 0.1)
hH <- hist(dH,
           freq=FALSE,
           breaks=brk,
           col="#DD005550",
           xlab="(N,O) distance (Å)",
           ylab="Density",
           ylim=c(0,4),
           main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
           freq=FALSE,
           breaks=brk,
           col="#00AA7060",
           add=TRUE)
legend('topright',
       c(paste("alpha (N = ", sum(hH$counts), ")"),
         paste("beta (N = ", sum(hE$counts), ")")),
       fill = c("#DD005550", "#00AA7060"), bty = 'n',
       border = NA,
       inset = 0.1)
# Why do you think these distributions are different?
# At what distance do you think H-bonds have the lowest energy?
# = 1 Tasks
# [END]