411 lines
17 KiB
R
411 lines
17 KiB
R
# create_refDB.R
|
|
# Create a reference protein database for Mbp1-like proteins
|
|
#
|
|
# Boris Steipe for BCH441
|
|
#
|
|
# For the species, see:
|
|
# cf. http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
|
#
|
|
# For the schema, see dbInit() in .utilities.R
|
|
#
|
|
# ==============================================================================
|
|
|
|
# Start from the database object returned by dbInit(); per the header note,
# the table schema is defined there (in .utilities.R). Presumably the tables
# are empty at this point -- confirm in dbInit().
refDB <- dbInit()
|
|
|
|
|
|
# === protein table ===
|
|
|
|
# MBP1_ASPNI: append one row to the protein table. taxonomy.ID 162425 is the
# Aspergillus nidulans entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_ASPNI",
    RefSeqID    = "XP_660758",
    UniProtID   = "Q5B8H6",
    taxonomy.ID = 162425L,
    sequence    = dbSanitizeSequence("
MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKVQGGYGKYQGT
WIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDVFSAVNHHRSMGPPSFHHEHY
DVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVAAMSLSEQEHILYGDQLLDYFMTVGDAPEAT
RIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDLLRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLL
LDTISFRDWFGATLFHHIAQTTKSKGKWKSSRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDL
LLSRCPRAGDLVNKRGETASSIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIG
AIMAEASRKLTSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE
QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKFDVHRKLVALA
TGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
|
|
# MBP1_BIPOR: append one row to the protein table. taxonomy.ID 101162 is the
# Bipolaris oryzae entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_BIPOR",
    RefSeqID    = "XP_007682304",
    UniProtID   = "W6ZM86",
    taxonomy.ID = 101162L,
    sequence    = dbSanitizeSequence("
MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKVQGGYGKYQGT
WIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAAAVAAAAAAAAVANHNALMSN
SRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADGNRKRKRGMDQMSLLDQQHQIWADQLLDYFM
LLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDVGVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMP
SMVKIFQQTVHRTDWFGSTVFHHIAATTSSSNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGA
RKCVRSLLGRNVAVDIPNKKGETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVR
ESVQYRSQTASHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND
EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELVREVVGNLSVA
GMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
|
|
# MBP1_NEUCR: append one row to the protein table. taxonomy.ID 5141 is the
# Neurospora crassa entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_NEUCR",
    RefSeqID    = "XP_955821",
    UniProtID   = "Q7RW59",
    taxonomy.ID = 5141L,
    sequence    = dbSanitizeSequence("
MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREVQKDTHEKIQG
GYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAVPTWGSKSAKNANPPQPGTFL
PPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFSTGHRKRKRDELIEDMTEQQHAVYGDELLDY
FLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMGDVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQT
FPQVMKELFSTIDCRDLSGCTVIHHAAVMKIGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAM
RDARKCIRALLGRGASTDIPNKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQP
NYSSDAANTVQNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE
MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDENETESEAEHP
DPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEARGMLGTGERIDKYKHLLMSC
LPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGGGQPSNGRRESVLPALRGGNGDGEMSRRGSR
TAAAAAAQVDGEREINGRAGAERTERIQEIAAV"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
|
|
# MBP1_SACCE: append one row to the protein table. taxonomy.ID 4932 is the
# Saccharomyces cerevisiae entry of the taxonomy table further down. This is
# the fourth protein added; the component table refers to it as "ref_pro_4".
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_SACCE",
    RefSeqID    = "NP_010227",
    UniProtID   = "P39678",
    taxonomy.ID = 4932L,
    sequence    = dbSanitizeSequence("
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF
GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET
KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL
PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ
QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV
NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL
SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM
MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ
MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK
KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS
LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
# MBP1_SCHPO: append one row to the protein table. taxonomy.ID 4896 is the
# Schizosaccharomyces pombe entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_SCHPO",  # actually the Res2 protein
    RefSeqID    = "NP_593032",
    UniProtID   = "P41412",
    taxonomy.ID = 4896L,
    sequence    = dbSanitizeSequence("
MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQGGYGKYQGTW
VPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSSSTLHSVNEKQPNSSISPTIE
SSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDKYEESLLDFFLHPEEGRIPSFLYSPPPDFQV
NSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRLSQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQS
IFHHIVQSTSTPSKVAAAKYYLDCILEKLISIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNR
QRRTASEYLLEADKKPHSLLQSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLI
RANRLKQDTLNEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS
DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLIAMSCGINPED
LSLEILDAVEEALTREK"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
|
|
# MBP1_COPCI: append one row to the protein table. taxonomy.ID 5346 is the
# Coprinopsis cinerea entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_COPCI",
    RefSeqID    = "XP_001837394",
    UniProtID   = "A8NYC6",
    taxonomy.ID = 5346L,
    sequence    = dbSanitizeSequence("
MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGGYGKYQGTWIP
LERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVINTRSTRKQVADGVGEESDHDT
HSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRYRQSNDRYDEDDDASRHNGMGDPRSYGDQIL
EYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACAMGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDV
RKFPELYELLHRSTLNIDNSNRTVFHHVVDVAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARC
RSKRLVKLLIDHGADPKINNHDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLH
YSAAAQKASTRCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE
NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQVQQEEVSDLV
ELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAAGCGGLEPLEIDSVLGMLLET
LEAEDPSSTSATWSGSKGQQTG"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
|
|
# MBP1_CRYNE: append one row to the protein table. taxonomy.ID 5207 is the
# Cryptococcus neoformans entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_CRYNE",
    RefSeqID    = "XP_569090",
    UniProtID   = "Q5KMQ9",
    taxonomy.ID = 5207L,
    sequence    = dbSanitizeSequence("
MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQG
GYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKEKETGRTKATPSRTGPTSAAA
LQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVMTDQDMEVDKMGMHMSMPNVTLSQNMEELGA
GSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLGIGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFD
PNAPIDDDGHTALHWACAMGRVRVVKLLLTAGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNR
TVFHHIANLALTKGKTHAAKYYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRD
SRSAEDYILEDERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE
RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVREQRWENGELE
GNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQIAKYRKLVSAGLGGVSTNEV
DELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
|
|
# MBP1_PUCGR: append one row to the protein table. taxonomy.ID 5297 is the
# Puccinia entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_PUCGR",
    RefSeqID    = "XP_003327086",
    UniProtID   = "E3KED4",
    taxonomy.ID = 5297L,
    sequence    = dbSanitizeSequence("
MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETYHLLPSRSPPT
VSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREIQKGTHEKIQGGYGKYQG
TWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVKVSKVSAASAARAARAVVPSLPSTSGLGGRN
TNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSNLARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPEND
NSLTPSELSLPSRTPSPIEDLPLTVNTASSQSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPS
NAASAKYAKLILDYFVSESSQIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTAL
MRAVMFTNNHDLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE
DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPGGTSNRSDFVD
LVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERDLDHAVGLLSNIEKEYLEGQR
KILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERARRITEQRSKYLQELSIEDRKLLDSSNLRFAD
PSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRESVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSAN
TSRLNNYRKLISLGCGGIGLDEVDEVIESLNEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
|
|
# MBP1_USTMA: append one row to the protein table. taxonomy.ID 5270 is the
# Ustilago maydis entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_USTMA",
    RefSeqID    = "XP_011392621",
    UniProtID   = "A0A0D1DP35",
    taxonomy.ID = 5270L,
    sequence    = dbSanitizeSequence("
MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQGGYGKYQGTWI
PLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRSRRATSIETESEVIGAAPNNV
SEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARYADIILDYFVTENTTVPSLLINPPPDFNPDM
SIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQTALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVF
HHVVDLALSRGKPHAARYYMETMINRLADYGDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKN
AEDYIIEDERFRSSPSRTGPAGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQA
HGLLKQIQTEIEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS
TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTMAAYRRLIAAG
CGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGAP"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
|
|
# MBP1_WALME: append one row to the protein table. taxonomy.ID 1708541 is the
# Wallemia mellicola entry of the taxonomy table further down.
# The row ID is requested from dbAutoincrement() in the "ref" namespace; the
# multi-line sequence literal is passed through dbSanitizeSequence()
# (presumably to remove the embedded line breaks -- confirm in .utilities.R).
refDB$protein <- rbind(
  refDB$protein,
  data.frame(
    ID          = dbAutoincrement(refDB$protein$ID, ns = "ref"),
    name        = "MBP1_WALME",
    RefSeqID    = "XP_006957051",
    UniProtID   = "I4YGC0",
    taxonomy.ID = 1708541L,
    sequence    = dbSanitizeSequence("
MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGGYGKYQGTWIP
MERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSKVFHPLSSTKHPAKLAAATNA
KAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAIDGSISYEDIILDYFISESTQIPALLIHPPSD
FNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRVNHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKH
DRTVLHHIVDLALTKSKTHAARYYMECVLSKLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPN
KDGKTAEDYILEDERFRQSPLLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKER
DYQQAQVILRNIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE
ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENANKKAALASGI
SGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
# === taxonomy table ===
|
|
|
|
# Append the ten reference species to the taxonomy table. ID and species are
# parallel vectors: the i-th ID belongs to the i-th species name. The IDs are
# the values used in the taxonomy.ID column of the protein rows above.
# Fix: the species epithet of "Puccinia graminis" is written in lower case,
# as binomial nomenclature requires and as the other nine entries already do.
refDB$taxonomy <- rbind(
  refDB$taxonomy,
  data.frame(
    ID = as.integer(c(162425,
                      101162,
                      5141,
                      4932,
                      4896,
                      5346,
                      5207,
                      5297,
                      5270,
                      1708541)),
    species = c("Aspergillus nidulans",
                "Bipolaris oryzae",
                "Neurospora crassa",
                "Saccharomyces cerevisiae",
                "Schizosaccharomyces pombe",
                "Coprinopsis cinerea",
                "Cryptococcus neoformans",
                "Puccinia graminis",
                "Ustilago maydis",
                "Wallemia mellicola"),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
# === feature table ===
|
|
|
|
# Append the eight reference features to the feature table. All columns are
# parallel vectors of length eight (one element per feature, in ID order
# ref_ftr_1 .. ref_ftr_8). Every feature is assigned type "ref_typ_1".
# Features without a database accession carry NA in the accession column.
refDB$feature <- rbind(
  refDB$feature,
  data.frame(
    ID = paste0("ref_ftr_", 1:8),
    name = c("APSES fold", "KilA-N", "AT hook", "low complexity",
             "Ankyrin", "Swi6 fold", "coiled coil", "McInerny 2011"),
    type.ID = rep("ref_typ_1", times = 8),
    description = c(
      "DNA binding domain by similarity to structure",
      "DNA binding domain by Pfam annotation",
      "DNA interaction motif by SMART annotation",
      "SEG annotation by SMART",
      "Ankyrin domain by SMART annotation",
      "Swi6 fold by similarity to structure",
      "Coiled coil by SMART annotation",
      "Yeast cell cycle review"
    ),
    sourceDB = c("PDB", "Pfam", "SMART", "SMART",
                 "SMART", "PDB", "SMART", "PubMed"),
    accession = c("1BM8_A_1_99", "PF04383", NA, NA,
                  "SM00248", "1SW6_B", NA, NA),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
# === protein annotation table ===
|
|
|
|
# There are many protein annotations, so they are not coded explicitly here;
# instead they are read from a prepared text file.
|
|
|
|
# Read the prepared annotation table: tab-separated, with a header row;
# lines starting with "#" are comments and field whitespace is stripped.
tmp <- read.table("referenceDomainAnnotations.txt",
                  header = TRUE,
                  sep = "\t",
                  comment.char = "#",
                  strip.white = TRUE,
                  stringsAsFactors = FALSE)

# Remove the notes column (the last column): it is in the text file only for
# our reference and is not part of the data model. drop = FALSE keeps tmp a
# data frame even if only a single column remains.
tmp <- tmp[ , -(ncol(tmp)), drop = FALSE]

# Add table IDs one row at a time. Presumably dbAutoincrement() derives the
# next ID from the IDs assigned so far, which is why the rows are filled
# sequentially -- confirm in .utilities.R.
# seq_len() makes the loop a no-op for an empty table; 1:nrow(tmp) would
# iterate over c(1, 0) in that case.
for (i in seq_len(nrow(tmp))) {
  tmp[i, "ID"] <- dbAutoincrement(tmp$ID, ns = "ref", code = "fan")
}

# Add the annotations to the database.
refDB$proteinAnnotation <- rbind(refDB$proteinAnnotation, tmp)
|
|
|
|
|
|
# === system table ===
|
|
|
|
# Append the single reference system, "G1/S SACCE", to the system table.
# The notes text is assembled by paste() from two fragments, which joins them
# with a single space.
refDB$system <- rbind(
  refDB$system,
  data.frame(
    ID    = "ref_sys_1",
    name  = "G1/S SACCE",
    notes = paste("Regulates transition from G1 to S phase",
                  "in the yeast cell cycle."),
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
# === component table ===
|
|
|
|
# Append the single reference component: it links protein "ref_pro_4"
# (MBP1_SACCE) to system "ref_sys_1" (G1/S SACCE) with status "include".
refDB$component <- rbind(
  refDB$component,
  data.frame(
    ID         = "ref_cmp_1",
    protein.ID = "ref_pro_4",   # MBP1_SACCE
    system.ID  = "ref_sys_1",   # G1/S SACCE
    status     = "include",
    notes      = "Part of MBF complex.",
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
# === system annotation table ===
|
|
|
|
# Append the single system annotation: it attaches feature "ref_ftr_8"
# (the PubMed-sourced "McInerny 2011" entry) to system "ref_sys_1".
refDB$systemAnnotation <- rbind(
  refDB$systemAnnotation,
  data.frame(
    ID         = "ref_san_1",
    system.ID  = "ref_sys_1",   # G1/S SACCE
    feature.ID = "ref_ftr_8",   # PubMed
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
# === component annotation table ===
|
|
|
|
# Append the single component annotation: it attaches feature "ref_ftr_8"
# (the PubMed-sourced "McInerny 2011" entry) to component "ref_cmp_1".
refDB$componentAnnotation <- rbind(
  refDB$componentAnnotation,
  data.frame(
    ID           = "ref_can_1",
    component.ID = "ref_cmp_1",   # Mbp1 in G1/S SACCE
    feature.ID   = "ref_ftr_8",   # PubMed
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
# === type table ===
|
|
|
|
# Append the single reference type, "UNDEF", to the type table.
# Fix: the ID is "ref_typ_1" (was "ref_typ_0"). Every row of the feature
# table in this script sets type.ID = "ref_typ_1", so with "ref_typ_0" those
# references pointed at a type that did not exist. Numbering from 1 also
# matches the other hand-written IDs in this script (ref_sys_1, ref_cmp_1, ...).
# NOTE(review): if code outside this file relies on "ref_typ_0", change the
# feature table's type.ID instead of this ID.
refDB$type <- rbind(
  refDB$type,
  data.frame(
    ID          = "ref_typ_1",
    name        = "UNDEF",
    description = "Undefined type",
    stringsAsFactors = FALSE
  )
)
|
|
|
|
|
|
# === save
|
|
|
|
# Write the completed refDB object to data/refDB.RData. The path is relative
# to the current working directory; save() does not create the data/
# directory, so it must already exist.
save(refDB, file = "data/refDB.RData")
|
|
|
|
# [END]
|