From cf90c52a616c8a7c201ad4d831fc25ae9970c056 Mon Sep 17 00:00:00 2001 From: hyginn Date: Wed, 23 Sep 2020 14:06:58 +1000 Subject: [PATCH] Update the ScCC network data --- data/scCCnet.rds | Bin 5634 -> 6312 bytes scripts/ABC-makeScCCnet.R | 105 ++++++++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 33 deletions(-) diff --git a/data/scCCnet.rds b/data/scCCnet.rds index 4e27f9cef4afc2bae567610bce443126adbbec34..7296382d627e541cef0c8b6cbc00c0f107718986 100644 GIT binary patch literal 6312 zcmV;Z7+2>XiwFP!000001MOWsY-Gogo|CLQ*}xeDE>XyElQZ6X^PkZ-vb>VHs{!v0 z+>1*{*#M?dRyI&XJ=(s`@%_B-uAJMaAGe|+$-qxQFd`Tsw3I&1B3|J^qD zY5VVg`mbMZR`p=}?!%^@5bV7MuvgtlSa07B98PWmZdLX8uET%ZtOnif6?8ZrzV5KD z`jeZ0tso5}I2yg~u&#zHcv0Bx-bvWsUcq^=KO#7&R&WJZ@HXMFO3hbO(_fpK^tTxd z2@c0U!b!Vr>6O62g} z&VvKyt z!D|ouEM}wXPQWplf+jsrSrYcU49RY+U<3yw{MQG&)i{FnXnO@6veSGc;Apaf7lgDA zT?8p_w$-c01+b|?NKuf&9yb=)uc>i=xP5U*!RG?luZEX|zv67|uQ;c=f-CqV+@cz} z*96uzP2u%`-R|}ZUJ$0mcr!RIfa%d{3x`((KiI6Q(d!S#Hw04$R0_My_G^SP?g~2G zs+yEnw+Kg*6}$o1tyj?CuV;tUD|k_urj%ze@i;k5i>BU4I2x?rk8s?5!(d$xR&XvH zSCcmyHay>JgIiUabh=(xM{t}32`hMQuuivWcNdN)cLB1dltC8$Hy@^z11%hlR`BLw z(`DG}&xQBf=z4fz*lt^@0+=R#@AFq+OTsjPQwsY@30W~5O#(Ph=KJAhmDcrLE$k&m zW)Y;)murPxntmBf%}5UWNl5(RR#m0kD1_-*s=$vntF&C_HHF=3dj)SDrUwciaq;lo zg@f%&!*ttH2=ke@xj5`~ zw+rCthG4%xxilOOO5o_4ASDM{IN-l71JmHM7!seGG*ExA)vc0&9&S=Gz)ixzD1yUe z3N1`s`3%Zm3t&BLQrHZZfJ2(mBG}!&FzluUW~DGqgbMt4v#Jtdn888%S1L{Xm_OWG z2%Eum!8Fym5Ds|?F&xtr>RxXWz#$DppFtJEE>EzV1_@KHV{Bnv0#hxUZZ_$0QYlQo z#%kdp5wq!5T{E2B*`-&dO0BD$Ft>h~CAUsvhv_EezeU_6FDnb1e>2S=!qQa)Emgy0 zqH^)11{&;$Bz2LkbCXVAGWt@LCMj)6@C&~san(+LMDZPk^Kv7C$cPjXw*xg4lhUS2 z3cclCC^>=?BBkiEwRvMQ%w#WHG6wy0Mij8i(NMOHpgpCm!o%68%qbals*4hZ$!9u} zdYmb4>iU~=Us)sh$U|;JneuId!i{Wyc6n~dWj|VF$=r}ybxDWIS9hW~^7Xj{RE$TGI{Dm9sw^cWmy+r%X-x|%6@0RnmHGnlv~o$9 ztgqxaC2Kp~qGQsJ^^0oyid1$6V%$U~^U#UhB7Zb2ilfzkToIECL+U0(o#o;UscI3a z6YZ81uN;-gEi6K$xHegC`SwCsCr9Y9A1s=0Y`#Q9^(#`gX2K20uF1QSj1ki(`YWk) ziQFXm6N@BucA4%+;YW7oL1f(G6_gtw>K8aUAo;r1TXpFujLuXa 
z@(Kz!BxNC$&ieTX8w$;SY-$LRYPZy#$&BQdop_%j8Zsi;W3+k!M{xS(;KK{u^inve|ENX=$R=AG(JF)aaDhrW{C=PBUTa=Nn97Fc9+TJ=J)pm+tLM@UgdLS`XuWj zRkV_GlSPRS6D6;b8@C>*6$yVEGYp%#q9Ug}vLO;{7H^31omZgF*`m~3i4#Ro`3cdO zGLWR~S9q>bbdV7zQi={T#5IqOnsmkL~|9+FSEL}KOWP0qV-YsuoX zrz<>cDqEB=A-Rk?OF$KCchP~bOi<#Sc!MB(7O5-RAjtP#`c?GRZ672SRfP@r3vsk5 z^&5*SH0oe~6tAzSWUC89-dd6#ZAF*;h`AQ`6h~hf%?$^=6e9^45^YLo8YjsgvbPdT z7%N?D8AK(e$Yeoq!b$RF(r8Q8t6bFy!u6%X;=-6X6(%*C@Tkx3D=m^rc3NpfJJ5Xvnb zaY0V75!1B8CjB2Q7abWC9iECYk0nP+BeG(TT9V|~lng1#euy!TGgKPJhq;kF>2yt7 zk9eCV!I(^;XsacRjVl(E8@DvyWhNNUXc<$W>7A>y40ps7JZK$3cP?|XGkS`1Vh>w% z#uvL|ak%*>)WVG*G+pG&3U;{pxP7jYN~1oRe$gH$%Ux3p+=ixUP5z>&Kt3EO*Z~zK zU>ALG6op%28M45&AZfge9!Y?qr|{t?)J56aWhw_2g@^@IT^|`Aa?(kvhBD9IG?dP| zyaSqLnT9E!BYz|}PFtHb@nwq=zn9O}<|*{B=uu0^=)31Iu8Y|o8)Q8p z(gV=TJ5Bk*SBw8=szIhc_*yo*&xTT+`bnVf_oHLVya<0Rb-6Y1nJ?ZbIoIIY58aByy+)I( zd-f~ar%23@RVf#Iu2PiU%K02*^2L?wy-=~~3->891*u2MstvgmljAxqQ7Abv5hC*s zeuHz#cu`~uvI=oH9T#1wYJ^d4UIKy07{h z@=C@PuOaqI28!z^lc7N_ zauY@Ao!Ac+9o*!hcuxZL`6;-}cgwwQRNb*C%8lwqC!!$R%u zv6j_TuhJbxQ9*RiSdm=C^;dZ!RBj;+w@bn+wpU?_b>lt7%vry{NiURWb?@!<|>0X3?Hio|~X9Xn_u9#+)nXW%Wh(xwI*q zQGV^{VE$g?)z^;B?>XYLazh5bzaRe9+I9GMe_H#4$JTzse!pk}i9{6_%x@^AtpwId}e3##IdcJ2q zpq`VFBosibU_}m z9^TJOd2g5J?ee(B<3P?G+xy@nu*>TS)9dYR{`+Uv&g6Oj$ncT0f68=wzu-T{?a1$# z@4NgS^d9o+?JB0%*XK;;0Xz0m+M)kb>374P=k~kN2|IxKfgjFK=$+9I{_G#y@Aqw7 zqXYKs5OQqeVm%Lp{zDu0FV_Eo(Z7Ep^Zl0R<9vgjn!SO&_4jOimm}~A_6c@mhv{(s z-{bcFzVaRR{t3SaJLvij{=j~&+tc{%_VNkO6XWfF$L%sbg&h&QqWbQB=c!>ljXuZ~ z_zHbHRQ`hRCI|2f@Q*HkJN7(U@B|+^LMQrxJa%Nh$A-bbHN#j3$S3H9{y*p6!C&|b zm+$X|uBY7I*Y%9&o2}2fwS!-=e!y4f6YSd&W4ALW+^>%(b_RX|>*?``^?NwA_bS6! 
zXS@eFf_{Pyj5|jToj$w=x{s}Y@LTi<>wkcC6S~y?xqM?jwoV=&V0@EDj}xBq`!2V; zh9MW$uk+pdfgHNO?r=Yueh9t$=S;WTjWxr2(jIbV^r_!>yJ_v*E}|cdU)lSo@;l`1 z*xs|}A?U2+{i!`|{aio5|0&Po0DOYHfd2QSAK2TDJ-_9@d;D_5^Yw8%%$I%GWytZm zj0-;PGF?u`9`hrUd&nv5#tUf&dSCKW<1E-Kjg!GYk2huAeSFbB*dg!N_~L#8c4BJp zgU^q-AJ{+FNATs8`&0RI|BZ2XnO?V(Q*MX2#p4&O&yhWiAMQ^<_m1IRrc3>m$2VAy zC;WHB!LT>(7moP7%+s9)WXGeaw_ux-`A8GXZK5j>_z31`z8q?`=Zoj*IKtGVf zbD1~Z6TNhKfxOxH8vnZ;#5ndI`0n!pKQLdLzsF0UZ)*62$Mbl!;(pv;K`&+9F<-aG zpj+s7yM^{og`RWa59VQXA^!Jx8unwu{(hh5?Riv9amB7VzwYsSu+wgz?CE?q z`Embd{=xSNd-gPbK(9RygmvGw@zws^m-%3P8`tf-_&JYT*O^|o+n^hAykYpM^b2_h zzu<4I9oF6V0Ug7YwS$~vKH^7oUBPePf1*F=sm;gpHK$Cs#_5hBhv@f_(2e)OAJD&R zPsoRj?|ggC@4H`LvvwQyclaNp*Y~%Nd0e$a9tXf4;61FL>9^WR_ph*fm>1sPvG(X6 ze0s)wa((FVJUk!unA^keX&k107k=O4&l7Iv>%7MO_&ywb2i^E3WI zPa)5)KO(1|-+j)%yFIY^yWE|~`_NmoH~zsNJ3pTD`~G*CkLG24-*{$N?IqUhd#1Gz-=fTmx_3P_`cIW&%=HdEcOC-k$Darf+Lzx(=~ z@q4b1W*1zpU?yey;O; zT%YYdkGnB0<~tIM=bE*LUqHV**013#Zk!4>zIz<=l*d8*ao~4&+EV*9@+0kpPv)H;CAkh zAvc(>(Czs^*hlCG=4t$Kdj@-9^YA<#)(iZ8A?vPf$OQYcR9g)AqVT$?udW)ye{ZEk@n!n)bJC2`Z|HmbHnz&+v!sq5AAjhV?32N zqZ9J#{NJ$lMz7lm>(Ao=>nA&(#NR=m(dF@|^$-2_d}zh=yZ=D{VjtapcMP9d|EBkT z{_=?X@wgd!gn3Qv??$i7G1kZ0xj(Y@dXDUIG{~#?|g` z&KY|=4u9F<@!(Hgj=-NsJU^EUtRMQrxM}YnM03_pUu(a(mBzt=SXn z4LXcJ9>>|!<8$y~gWGFefZIjzQQG_Z8NW1-t9s}9BYNihYUq>oqx(zW_u4%Cob1%b zS(9<0-<7?Ges=5$JB|Kq-kx{B{2>pZf7hNBkLU6SeS{oTJg-ApXKxR^!ni`8=P67d z)enH5nLeFxdyj7*FOVPW-}CJExIfP~Lf;^#7|-VK=W@ay^&jpp&@cLvbx?cf^@N}U zb_?HOra_J97t%CO)0;*&pp{&oA0S?_P#kMHh&_2oC8|KW>TaAWxApMLqL-~Hi>&p-R^ z*Pnj*)#uXXPrv%|&%Z#o|J?rft-bbt{`R;3`F9=r(?5RlwT}Id-~8^6zy0j@hTd+G e{qtv^ee%sGzxe%EZO1M8-~R*8e#|nh*Z=?+oBFx{ literal 5634 zcmV+d7X9fTiwFP!000001MOT}lU3EVUPt0NC#gy*Rp)mkwJ!VmnC!$HwGpZ-r}jBd zDWj$;Rf(q5s=VeGpdyHXq7emYA{S|(4ah}A0f7Jlp-pp1E>4(+3zrEL*mvJ9s&bj{gJ4uo(OTLvnkSu%9{hK`aKREN zm$)0g?f(Cd|NGHUnl-wM9p<&p!iTkV30A|bd$(Y-Ge2-R&Dx6_{&*;D6x}5_FRwg_Oe*12=oYKPBeT0k}|U|Z>a9+v(4g>|L-_Z@5~L7GQcq-_gkt?qpVwYR!=u+>@YP~}$h32T|MSG8Pu 
zScWxak!tl!{aSB!XAM=1G+a-c^8D{7AF$c%-gEfSP@1+DA#5)Q7&?)=3JcA$!9g0v!iU3Y9!_C?VXLzQ=LHtoAmGnr zt!)X;4-7XT^@QObj6uRy<8HxrvFKoyHI`r{Y^R;Y26H`ydjyBmFyFtIFsMQ}*lsO$ zn1xa9!iDY50)$#b8EVP-2EuTytzXz`Ex~`mywI=~oT~V57Zf#T3tecM-Z{+lvx9?S z;@K~(2b(?S7q)|_{$ZCLH)aoOfo13us%6xeT?jBN{XL?0h0QZ;bY~620J0a%>S{(=Yf!LV>)r!6oQAs4FKh-r zbOzX%Jq-5~_6rq59n8DcFe~bvdk!CUdeojh3?rEV!YmDrFwX#67R=QYvR1u&r%>;7 znhP9qZZQ~hzp&P>hAuA)_`6_XQ;&HbVXLM(3hPLH!YprgW(~tIAcn2HJ6o7%>7ZcV zF<_`icNG@3?yO-`k6FVaOEs*SP>rf66s?M)*)d=%>)s8xd`K;kESHV+o%w>T`mA9V zZcth7!rE*=Fx>7gu+h15SSz{~Y|R_2*EUN=2Mnbyf_Yeg?HASy z1seeeR}N>g`H~%4tmK4~ft8 zvLEtzR3akAQb~+f)K|E;qlif8AtkS5@}=dVDfeuiR@Xs1Zg)7h1GPXmScw@@^tECGpQuK9EFY}6OZT^2bIL)RkhGrlRC$OR8y*< z!jQ;7so0;YiR5rs4yy$mI`KofEWQ7OcGD2|$xD)3B&$LNLK0E(Rz!Bvee#23WTJw% zvMRJ@&ho4em(T=h3C@lRXw0^aC7%+Nj?|evJIN%OpxD&wgE;EQ z%M3m=S1b#4pXf{#Upn1L2^n#L%Kvi;)!6^47?=Wfds{T6r|cHBE|7>uYi?H5M`0f6 z5v?NH9+t{lY@fBmwfzxQUdhu%#bc!!WEtpCDaE36c$d;*vh9zEeO6vWhnF15s0B&A zjzq5lq6KFnESnJ>dy}Ul*G5OUq99*65whO2`a$YUzKqI;tzt9of~h zT=6v`3Mxs_5)FBFE=v!q7>R;yN<1SQONkgAl1@{#=tQAfn(Gu#C678=d{{aPTUH0V zX&9WPx?h#TDlX8<80Eh-DvBp5N#`h6D=clR{2bCb zrxgUljF6!q-KX$_mMUoTl&yny8PNowJhX5B{jB&&JgCQH!!J4hGHbjQpwX12Pt5-(|_VGaga{0 zQu;vLry$=hbaPTe#d)fYXo;m7qpcbwIq?wz+N73MujAOGvey@AzLb~@j46FBM#NP_ zwS_!71Q6zBoc?oA#0fryA1&pcRNzYfh$m==TRGVqZ#9TMA~j!aPO_>AnB*2-yN zJ#qwUB|R!TsP|D(QIX<0N)K({ZRb|hkzMSvSC=S8dMtXyg_;8GqtunpzKRl2eKIQy z)zDUji?@0+0BN_qpCPt{pHLG`p z5$PyDB8jCWoha$_H`R+=nJ%I#rQ>om1sF%SepOe=vK7ena=4TcP;{-r0OC4KayGVl2M(D{K7HO#MJ`Th zYpU2qo8l?cRIw<<&lE88o-y5Wtn8Vok~-e7Qt7^ubW)sNyZ%(ab~a&GKcZ}rz13W# zj`o?`JC4ZI8!PFg-F@vGApIqICR0F$ks?N3*3pYiy)2=1P=ZSb<_QHdo8;NarA9?3 z5;L{$nqr1n>WqEn5!qE{ZHKk7!r?p#2ayWhsrEQQ}&+TKId!sPLx;jd z%&O07Q`!Uar9NI`vr9XCCiBwCzHQ9je8P5ly}ML2H@j0%>R*$5!vnivD8ge+#JmeHu}4IAu#=isa>#mQD^4HVb;ttS z*|EJE9%+i|NC=x(HWhoxW_8WHN8-sLgte3$M+KB*F)gOhf3;u#Bnt}AkR^_mwK$|4~&_5|{n&0Q983B6vep|Mq=H81Vi z3tIJ{NJgI@R8kioRnsrsDKzB3CR0oH>L}t2uRhW?M~_Oqf*|roHevPp0vAJ+#70_X 
zDQxT|^(%SEm!+D&(My%8kGaLf$?4fzCFvqBsaK`7Y{Y7%c7{Z+dNNF|oq5^pCVA$9 zRm273*&Z*i(*B6N71N67tbgS*o_5e{MyioqSvN;7uTo0aqa%*5*gjMA)u>jhUM*Wu zP|tp?sE>cuO?};Ol6?28_h2Qx8>@yXzx&{RQ`2#4{V*bpzvj!(B(+^)beN%w5~i&sFq!7T+Iq*BOjy6l2)wt}U3) zWzgmluGcV*30#NJmyGFxyEbDyUk0>Yi_g~s9teFlxobbZyNdR=yX#uOn-AP|3$&8? zY{dQ30iSN;nK$tN4tz@6uV7B6(asKB!iV?4%Ri&N5zK8G^qoRGw{Y(<@MA6LxdHud z0gcY#_sgKuHavF??c6|{e?*(U?Wth=!jD(craXJfU28D+ouKt7=rx6VZ(%-OhBxB* z?cnRqfEWJy2;+PS^WOtHZ3^!B44;#riS&6MwDh#!3Oa8aM*358lVU@;QOeF|;uX z-n|HVZw7sDV2md~pZyrqO^oGL{B{`pK8E?63Ff#OeEcKu8hq}?=QzeA^ZNi}+=D(= zfzG=E`gr}h9q{XAjC~{SS%-1SZ(GpLxun#qQ|Rk)oCfH!+E51ISt;FH)L zuOGL-Z*M18C8fRi0RN9-t}o)=YoPBLTqC&uPvF@YcyS$b*a@0UgT^<||0?j;&;NE( z+R$D2lrf1s?E_vF(C{31;pZ=7c>&{Gg?>DJeE&Bw_El*6W88B9pJ&0ZL%2j|-cG*O z#V^5^xAFfZ=pu5l1AUx9`xChLMg0Cb#wPb{2?aueA&l=9cqp_Gaj(~P zFJnT(zXtre4jOu!J%V<=zye81^7ijka`Z|a2ZleEjjC+4Deow1UF}F3K+Zph068&BV&31x+^4(ta z|8i2sMUxnh=(*Uy&FE(w?d?T>LQ5Z$tV>ECdku8i4?e$vdr#s15%jqlzl%-ZmXtBc zX3X_0_$PBej%P(CU&2_<26OTDQu^3|w&k}EF%SQ&(DMjrwF7fKi8iN_GNzsaeXn6& zNF8<|4~SF)nYr zP6qe+nDlf~+6QkteC#K3^bW4Kz!y(zZ%2KcENxvyyVL0FD885W_k!M^V7~i;?>`02 zy}v)1l)g*kWK~k?{4{=h7qsLYGneegSls=UxGyrCl#eGPc#Y$J;;8dvEtAg75s?KTb-U>f4_{ zeO3 zR`BHAq}26s@b#6XjKxJR4q)s)P8ZqPgwJi5{~65J%i3(?my+KGGRdKu#vd;S*MK8|_s3+U+m=7oS?-cES^5Zb(n=Wl@@$MBu^&0CT( zFBt)iygj*q@ArTMaa=@yx6#g*pyg{xX~TUUH;p-o?3_Y>-lnd@ z=RUOgE}q$n@d{nv2W^ihrG5ALuE?R!1J>c**YUXxd^>~Rg|FT(zXATdgEmg%xl5Su zt9Z`mU2kGs8!#`CsR`V_3gbP3`3rr;KKvyq{q$bY^W&tniC^G5PggI`LZ9{c{}S30 z*}MWeznhdkcnb4Afcti03}^BD)qrPflhQt%P0E;gEa0`YaTxu|_nR=*YoMR^S?BTn zi})NxAJ>yo_qKx8KF$-Go&%Qgzm$}6;`Lx0be{wtUIU#^f@XPx5zc|X_rx>5{pGKpe(K3z zJom&i&pt&>e*4*He*Z(??04Puz#rV7fBy6D3HSZPZytY+`~LUmfBBnVJozi2We(xr c?>zbV^N;`VSI@eJ9s0-r18{o#Z2zhN0C4UaqW}N^ diff --git a/scripts/ABC-makeScCCnet.R b/scripts/ABC-makeScCCnet.R index 81d0a49..b464d54 100644 --- a/scripts/ABC-makeScCCnet.R +++ b/scripts/ABC-makeScCCnet.R @@ -6,40 +6,70 @@ # Boris Steipe for ABC learning units # # Notes: +# +# The large source- datafiles are NOT posted to 
github. If you want to +# experiment with your own code, download them and place them into your +# local ./data directory. +# # STRING data source: -# Download page: https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae -# Data: (20.8 mb) https://string-db.org/download/protein.links.full.v10.5/4932.protein.links.full.v10.5.txt.gz +# Download page: +# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae +# Data: (20.1 mb) +# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz # -# GOSlim data source: -# Info page: http://www.geneontology.org/page/go-slim-and-subset-guide -# Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab +# GOSlim data source: (Note: this has moved from GO to SGD) +# Info page: https://www.yeastgenome.org/downloads +# Info page: http://sgd-archive.yeastgenome.org/curation/literature/ +# Data: (3 mb) +# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab # # -# Version: 1.1 +# Version: 1.2 # -# Date: 2017 10 - 2019 01 +# Date: 2017-10 - 2020-09 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.2 2020 Update. GO Slim Yeast now at SGD # 1.1 Change from require() to requireNamespace(), # use ::() idiom throughout # 1.0 First code copied from 2016 material.
# # TODO: # -# # ============================================================================== +# SRCDIR <- "./instructor" + +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> --------------------------------------------------------------- +#TOC> 1 INITIALIZE 58 +#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66 +#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96 +#TOC> 3.1 Intersect interactions and annotations 122 +#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128 +#TOC> +#TOC> ========================================================================== + + +# = 1 INITIALIZE ========================================================== + +SRCDIR <- "./data" if (! requireNamespace("readr", quietly = TRUE)) { install.packages("readr") } -# STRING functional interaction data +# = 2 STRING FUNCTIONAL INTERACTION DATA ================================== # Read STRING Data (needs to be downloaded from database, see URL in Notes) -STR <- readr::read_delim("./data/4932.protein.links.full.v10.5.txt", - delim = " ") +# The .gz compressed version is 20MB, the uncompressed version is 110MB - +# really not necessary to uncompress since readr:: can read from compressed +# files, and does so automatically, based on the file extension. +( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") ) +STR <- readr::read_delim(fn, delim = " ") # Subset only IDs and combined_score column STR <- STR[ , c("protein1", "protein2", "combined_score")] @@ -48,22 +78,26 @@ STR <- STR[ , c("protein1", "protein2", "combined_score")] # sum(STR$combined_score > 909) # 100270 edges # subset for 100,000 highest confidence edges STR <- STR[(STR$combined_score > 909), ] +head(STR) # IDs are formatted like 4932.YAL005C ... drop the "4932."
prefix STR$protein1 <- gsub("^4932\\.", "", STR$protein1) STR$protein2 <- gsub("^4932\\.", "", STR$protein2) -# head(STR) +head(STR) # get a vector of gene names in this list myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene # names +length(myIntxGenes) +sample(myIntxGenes, 10) # choose 10 at random (sanity check) -# GOSlim functional annotations +# = 3 GOSlim FUNCTIONAL ANNOTATIONS ======================================= # # Read GOSlim data (needs to be downloaded from database, see URL in Notes) +( fn <- file.path(SRCDIR, "go_slim_mapping.tab") ) -Gsl <- readr::read_tsv("./data/go_slim_mapping.tab", +Gsl <- readr::read_tsv(fn, col_names = c("ID", "name", "SGDId", @@ -72,52 +106,57 @@ Gsl <- readr::read_tsv("./data/go_slim_mapping.tab", "termID", "status")) -# head(Gsl) -# +head(Gsl) + # What cell cycle names does it contain? -# myGslTermNames <- unique(Gsl$termName) # 169 unique terms -# myGslTermNames[grep("cycle", myGslTermNames)] +myGslTermNames <- unique(Gsl$termName) # 169 unique terms +myGslTermNames[grep("cycle", myGslTermNames)] # [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle" -# + # Choose "mitotic cell cycle" as the GOslim term to subset with -# scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"]) -# length(scCCgenes) # 324 genes annotated to that term +length(scCCgenes) # 324 genes annotated to that term -# sum(scCCgenes %in% myIntxGenes) # 294 of these have high-confidence -# # interactions +# == 3.1 Intersect interactions and annotations ============================ +sum(scCCgenes %in% myIntxGenes) # 307 of these have high-confidence +# # functional interactions + + +# = 4 DEFINE THE CELL-CYCLE NETWORK ======================================= +# # Define scCCnet ... the S. Cervisiae Cell Cycle network # Subset all rows for which BOTH genes are in the GOslim cell cycle set +# scCCnet <- STR[(STR$protein1 %in% scCCgenes) & (STR$protein2 %in% scCCgenes), ] # How many genes are there? 
-# length(unique(c(scCCnet$protein1, scCCnet$protein2))) #261 +length(unique(c(scCCnet$protein1, scCCnet$protein2))) #283 # Each edge is listed twice - now remove duplicates. -# -# Step 1: make a vector: sort two names so the frist one is alphabetically -# smaller han the second one. This brings the two names into a defined + +# Step 1: make a vector: sort two names so the first one is alphabetically +# smaller than the second one. This brings the two names into a defined # order. Then concatenate them with a "." - the resulting string # is always the same, for any order. E.g. c("A", "B") gives "A.B" # and c("B", "A") also gives "A.B". This identifies duplicates. -x <- apply(cbind(scCCnet$protein1, - scCCnet$protein2), +x <- apply(cbind(scCCnet$protein1, scCCnet$protein2), 1, FUN = function(x) { return(paste(sort(x), collapse = ".")) }) -# head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc. +head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc. -# sum(duplicated(x)) # 1280 +sum(duplicated(x)) # 1453 # Step 2: drop all rows that contain duplicates in x scCCnet <- scCCnet[! duplicated(x), ] # Confirm we didn't loose genes -# length(unique(c(mySubnet$protein1, mySubnet$protein2))) # 261, no change -# Network has 261 nodes, 1280 edges +length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283, no change +nrow(scCCnet) +# Network has 283 nodes, 1453 edges saveRDS(scCCnet, file = "./data/scCCnet.rds")