From 6f60508e92cc822324938446e825fa9cddc0cf84 Mon Sep 17 00:00:00 2001
From: Youssef Mansour
Date: Tue, 12 Nov 2024 15:48:16 +0100
Subject: [PATCH] update open_lm

---
 open_lm/__pycache__/__init__.cpython-310.pyc  | Bin 139 -> 139 bytes
 open_lm/__pycache__/attention.cpython-310.pyc | Bin 5816 -> 5820 bytes
 open_lm/__pycache__/data.cpython-310.pyc      | Bin 21889 -> 21909 bytes
 .../__pycache__/distributed.cpython-310.pyc   | Bin 2941 -> 2941 bytes
 open_lm/__pycache__/eval.cpython-310.pyc      | Bin 1124 -> 0 bytes
 open_lm/__pycache__/eval3_seq.cpython-310.pyc | Bin 0 -> 1837 bytes
 open_lm/__pycache__/evaluate.cpython-310.pyc  | Bin 3939 -> 3939 bytes
 .../__pycache__/extra_funcs.cpython-310.pyc   | Bin 0 -> 6320 bytes
 .../__pycache__/extra_funcs2.cpython-310.pyc  | Bin 0 -> 6119 bytes
 .../__pycache__/file_utils.cpython-310.pyc    | Bin 14764 -> 14764 bytes
 open_lm/__pycache__/logger.cpython-310.pyc    | Bin 771 -> 771 bytes
 open_lm/__pycache__/losses.cpython-310.pyc    | Bin 1150 -> 1150 bytes
 open_lm/__pycache__/main.cpython-310.pyc      | Bin 23020 -> 23020 bytes
 open_lm/__pycache__/main2.cpython-310.pyc     | Bin 23056 -> 0 bytes
 open_lm/__pycache__/meters.cpython-310.pyc    | Bin 3443 -> 3443 bytes
 open_lm/__pycache__/model.cpython-310.pyc     | Bin 15249 -> 15256 bytes
 open_lm/__pycache__/norms.cpython-310.pyc     | Bin 5027 -> 5027 bytes
 open_lm/__pycache__/params.cpython-310.pyc    | Bin 18964 -> 18964 bytes
 open_lm/__pycache__/precision.cpython-310.pyc | Bin 651 -> 651 bytes
 open_lm/__pycache__/scheduler.cpython-310.pyc | Bin 1867 -> 1867 bytes
 open_lm/__pycache__/train.cpython-310.pyc     | Bin 9619 -> 9619 bytes
 open_lm/attention.py                          |    2 +-
 open_lm/data.py                               |   12 +-
 .../make_2048-checkpoint.py                   |  255 ++
 open_lm/datapreprocess/make_2048.py           |    4 +-
 open_lm/datapreprocess/wiki_download.py       |    2 +-
 open_lm/eval.py                               |   44 -
 open_lm/eval2.py                              |   96 ++
 open_lm/eval3.py                              |  116 ++
 open_lm/eval3_prop.py                         |  213 ++++
 open_lm/eval3_prop_2048.py                    |  139 +++
 open_lm/eval3_varylength.py                   |  118 ++
 open_lm/eval3_varylength_2000.py              |  124 ++
 open_lm/eval4.py                              |  139 +++
 open_lm/eval5.py                              |  162 +++
 open_lm/eval_redpajama_seq.py                 |  167 +++
 open_lm/extra_funcs.py                        |  214 ----
 open_lm/hf/__init__.py                        |    3 +
 .../hf/__pycache__/__init__.cpython-310.pyc   | Bin 0 -> 319 bytes
 .../configuration_openlm.cpython-310.pyc      | Bin 0 -> 854 bytes
 .../modeling_openlm.cpython-310.pyc           | Bin 0 -> 5904 bytes
 .../tokenization_openlm.cpython-310.pyc       | Bin 0 -> 525 bytes
 open_lm/hf/configuration_openlm.py            |   24 +
 open_lm/hf/modeling_openlm.py                 |  194 ++++
 open_lm/hf/tokenization_openlm.py             |   18 +
 open_lm/infer_proportions.py                  |   57 -
 open_lm/main2.py                              | 1034 -----------------
 open_lm/manifest.jsonl                        |  200 ----
 open_lm/model.py                              |   11 +-
 open_lm/params.py                             |    3 +-
 .../__pycache__/__init__.cpython-310.pyc      | Bin 160 -> 160 bytes
 .../__pycache__/head_rotary.cpython-310.pyc   | Bin 1889 -> 1889 bytes
 .../__pycache__/llama_rotary.cpython-310.pyc  | Bin 5794 -> 5794 bytes
 .../__pycache__/none.cpython-310.pyc          | Bin 338 -> 338 bytes
 .../__pycache__/rotary.cpython-310.pyc        | Bin 3709 -> 3709 bytes
 open_lm/run_bench.sh                          |   19 -
 open_lm/test_class.py                         |   76 ++
 open_lm/train_class.py                        |   68 ++
 .../__pycache__/__init__.cpython-310.pyc      | Bin 145 -> 145 bytes
 .../averaging_utils.cpython-310.pyc           | Bin 3758 -> 3758 bytes
 .../make_wds_manifest.cpython-310.pyc         | Bin 2841 -> 2827 bytes
 open_lm/utils/llm_foundry_wrapper.py          |   11 +-
 .../__pycache__/__init__.cpython-310.pyc      | Bin 158 -> 158 bytes
 .../__pycache__/hf_config.cpython-310.pyc     | Bin 0 -> 1831 bytes
 .../__pycache__/hf_model.cpython-310.pyc      | Bin 0 -> 7376 bytes
 .../__pycache__/hf_wrapper.cpython-310.pyc    | Bin 1393 -> 1393 bytes
 requirements.txt                              |    2 +-
 requirements_test.txt                         |    2 +-
 68 files changed, 1938 insertions(+), 1591 deletions(-)
 delete mode 100644 open_lm/__pycache__/eval.cpython-310.pyc
 create mode 100644 open_lm/__pycache__/eval3_seq.cpython-310.pyc
 create mode 100644 open_lm/__pycache__/extra_funcs.cpython-310.pyc
 create mode 100644 open_lm/__pycache__/extra_funcs2.cpython-310.pyc
 delete mode 100644 open_lm/__pycache__/main2.cpython-310.pyc
 create mode 100644 open_lm/datapreprocess/.ipynb_checkpoints/make_2048-checkpoint.py
 delete mode 100644 open_lm/eval.py
 create mode 100644 open_lm/eval2.py
 create mode 100644 open_lm/eval3.py
 create mode 100644 open_lm/eval3_prop.py
 create mode 100644 open_lm/eval3_prop_2048.py
 create mode 100644 open_lm/eval3_varylength.py
 create mode 100644 open_lm/eval3_varylength_2000.py
 create mode 100644 open_lm/eval4.py
 create mode 100644 open_lm/eval5.py
 create mode 100644 open_lm/eval_redpajama_seq.py
 delete mode 100644 open_lm/extra_funcs.py
 create mode 100644 open_lm/hf/__init__.py
 create mode 100644 open_lm/hf/__pycache__/__init__.cpython-310.pyc
 create mode 100644 open_lm/hf/__pycache__/configuration_openlm.cpython-310.pyc
 create mode 100644 open_lm/hf/__pycache__/modeling_openlm.cpython-310.pyc
 create mode 100644 open_lm/hf/__pycache__/tokenization_openlm.cpython-310.pyc
 create mode 100644 open_lm/hf/configuration_openlm.py
 create mode 100644 open_lm/hf/modeling_openlm.py
 create mode 100644 open_lm/hf/tokenization_openlm.py
 delete mode 100644 open_lm/infer_proportions.py
 delete mode 100644 open_lm/main2.py
 delete mode 100644 open_lm/manifest.jsonl
 delete mode 100644 open_lm/run_bench.sh
 create mode 100644 open_lm/test_class.py
 create mode 100644 open_lm/train_class.py
 create mode 100644 open_lm/utils/transformers/__pycache__/hf_config.cpython-310.pyc
 create mode 100644 open_lm/utils/transformers/__pycache__/hf_model.cpython-310.pyc

[GIT binary patches omitted: the deltas for the __pycache__/*.cpython-310.pyc files
listed in the diffstat above are machine-generated bytecode with no reviewable
content. The text diffs for open_lm/attention.py and open_lm/data.py fell inside
the same garbled span and could not be recovered.]
zTHLn$7C#0t^&4#Plj7awl}q?sqMeessYO)FrrApmO4yd&=@ijp)2nNCitEFl3LP0fm<*I~sli zg;;pmufWp;l)ULQwRJ3AI7gvrgpTt`cZ@&_=L_=Cg;5fg(EBr0)8(14QojxhlfpXZy~dNG71_v_3#GiT2E z&Y3g2tGK;bnX9ZE>!ZK>&L53;zFB!OTrI=tw2?7%o^uR7AjRL|b{RpLuZERPX)5C>;;nA@ zLR|wRuC7ezYszD-GO0q|hiM4F#rpcrHpP;sQ3R?pCSZ*^s84)ezpiUP%_BknD*cme zkqjivbb%khqSv4ac?6Koa7?Kc@=|PRXl8rFp@ttShp?)WZ^n+N#JugFYnV0%$SV*kV!m#1kV`RI}1 zQb1DaFV31Rwc@Q59LSI%E33z>42KtTOs?RlS%zl(I%1x0+Ssv`G_>~!7VH2VB#;5y zB7TlS=l7`Lhg2+k#Ii{X=6SK)FpR^r|6BTOxLf9{V&j7Ru(}zSCl$2?s-QQKY7s|( zGHe&EM7p3mIUY^&H%RndfPlvdB1~2o`8?09u`)=Hc&RzonuTJj3KkD$pk}dI*4!Rd|tc{C$kQs=C#yst=%1 zX;GzxDBK2ttnib74*{d~wf&}3L%6$qC*TFZHo$QLO7MFap8}{OmGzIID!(Sb|KBQE zvUf~8p4F*}_%2#2{TO)#tL_)dxtmdvn|$K@)Wvss_g@wI4U6L#?faEJuK~=h8PAUm z8*T5f$;hnR-{8JmACcyfOp686x>-^5Pg`0KBQlW9@eJqKvK#P>_-b0?tZOrOhQz() z2*nx5o4U*6b=P6W^b4))U|>DqzobPfMiK<+8QlFQFH7!$(3}N~CL=kcw_3Fv#?O&F zLj8RB(;0RC8ZR$I85-|darNdqTR+2QtmI<=UjUTpl{AWsAF84|TW| zSo#RyQGl0Cou<;CvK(qL31ZyjUt)nu@;4Y=2HY^+r$xGBGCMuIrK4GE`W{Pv0Ne{$ z2ynZ&IMMlXRAPPyAWR_r3De;}Vp@)Ax#2ijJ!@o4S*?af)Uo;v$I4|#1h(0y+fgHJ za5+J(@C3S%&L<82lX!R58FodzHV0 zvL>1~Y}<v_XT%Ib8^i0QUNJ(nJmsH3VMr^rGv7Sp!+OPa1ZV@fRMj|H}BgMpH^x2zP11tpjPNHqOOTXE_v~BKc=9HlPNS+L@394bP?aT&+D98PEpuy ze26VfF;aXX>{cal58o`Fo7cwlLhM&5j0UM>b8xOe?)TI@#NIf9J0Ca)5V1CpFVvlx>$FA-{B6~D!roS#=* zQpE?Rbd&S*b5iol^EO*c@G~J_Yv8Jqv=Y-VAcuz925 z2~Nhe%~jS}oC--mYi@D+CFZ6U7bGU97I_2tMLr-R1VprLW^s*RVPu~i>zTm@Qd`tE I`Lw4e02`w>8~^|S delta 168 zcmbO-g>lLhM&5j0UM>b8kexd}EoLL{96`pY$*Tn8fF!5TEXIV*ON1I&#jE&|^Ye;J zbd&S*b5iol^KP+%`J1gJ_?a0SCwogzU~JiZM|wFoW8&r(^$J$Tw9S8YHnTAHZQf{j zf|D_2bCq=#r$PeInp>QHiMgr81&PV2MczPukq?Ln0TC^mSzIGn7}+MrdSu{4I8-|m;fuI1cm?r diff --git a/open_lm/__pycache__/scheduler.cpython-310.pyc b/open_lm/__pycache__/scheduler.cpython-310.pyc index 9651a64268c4d5ab6815528c6cf3ef49a63487b6..d4301f7893c35517c65085ea390d9a3622f3f473 100644 GIT binary patch delta 20 acmX@jcbbnopO=@50SNZ~KC_YAfgJ!ls|DBq delta 20 acmX@jcbbnopO=@50SKm-Hf-c}U BUFFER_MAX: + time.sleep(1) + continue + + if buffer_lock.locked(): + if len(queue) < QUEUE_MAX: + queue.append(chunk) + else: + time.sleep(1) + else: + if queue: + dump_queue_to_buffer() + with buffer_lock: + buffer.append(chunk) + + +def consumer(my_id, output_dir, threads, buffer, buffer_lock, num_consumers, upload_to_s3=False): + output_directory = f"{output_dir}/{CHUNK_SIZE - 1}-v1/{my_id}" + os.makedirs(output_directory, exist_ok=True) + shard_writer = ShardWriter(os.path.join(output_directory, "shard-%07d.tar"), maxcount=SHARD_SIZE) + + chunks = [] + + start_time = time.time() + + while any(t.is_alive() for t in threads): + time.sleep(SLEEP_TIME) + with buffer_lock: + lenb = len(buffer) + print("Length of buffer", lenb) + if lenb >= BUFFER_MIN: + while buffer and len(chunks) < SHARD_SIZE: + chunks.append(pop_random(buffer)) + + if len(chunks) == SHARD_SIZE: + print(f"I am {my_id} and I am writing a shard.", len(buffer)) + write_to_shard(chunks, shard_writer) + if upload_to_s3: + upload_to_s3_and_remove(shard_writer.fname) + # print("FNAME", shard_writer.fname) + chunks = [] + time_for_shard = time.time() - start_time + print("shards / s", num_consumers / time_for_shard) + print("tokens / s", num_consumers * SHARD_SIZE * CHUNK_SIZE / time_for_shard) + print( + "hours req for 1.2T tokens", + 1_200_000_000_000 / (num_consumers * SHARD_SIZE * CHUNK_SIZE / time_for_shard) / 3600, + ) + + start_time = time.time() + + # Process the remaining items in the buffer after all threads have completed + while buffer: + with buffer_lock: + while buffer and len(chunks) < SHARD_SIZE: + 
+
+
+def tokenize_eleutherai(tokenizer, string):
+    return tokenizer(string).input_ids
+
+
+# =========================================================
+# =           Main function + Argument parsing            =
+# =========================================================
+
+
+def main(
+    input_files,
+    output_dir,
+    tokenizer="EleutherAI/gpt-neox-20b",
+    num_workers=32,
+    num_consumers=8,
+    upload_to_s3=False,
+):
+    os.makedirs(f"{output_dir}/tars-{CHUNK_SIZE - 1}-v1", exist_ok=True)
+
+    input_files = [glob.glob(input_file) for input_file in input_files]
+    input_files = [x for y in input_files for x in y]
+
+    # Shuffle the input files
+    random.shuffle(input_files)
+
+    print("Input files", input_files)
+
+    enc = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b")
+
+    tokenize = lambda x: tokenize_eleutherai(enc, x)
+    buffer = []  # Use list instead of queue.Queue
+    buffer_lock = threading.Lock()
+
+    files_per_worker = len(input_files) // num_workers
+    threads = []
+    for i in range(num_workers):
+        start = i * files_per_worker
+        end = (i + 1) * files_per_worker if i < num_workers - 1 else len(input_files)
+        t = threading.Thread(
+            target=process_files,
+            args=(input_files[start:end], buffer, tokenize, buffer_lock),
+        )
+        t.start()
+        threads.append(t)
+
+    consumer_threads = []
+    for i in range(num_consumers):
+        t = threading.Thread(
+            target=consumer,
+            args=(
+                i,
+                output_dir,
+                threads,
+                buffer,
+                buffer_lock,
+                num_consumers,
+                upload_to_s3,
+            ),
+        )
+        t.start()
+        consumer_threads.append(t)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-files", type=str, nargs="+")
+    parser.add_argument("--output-dir", type=Path)
+    parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b")
+    parser.add_argument("--num-workers", type=int, default=32)
+    parser.add_argument("--num-consumers", type=int, default=8)
+    parser.add_argument("--upload-to-s3", action="store_true")
+
+    args = parser.parse_args()
+
+    main(
+        args.input_files,
+        args.output_dir,
+        args.tokenizer,
+        args.num_workers,
+        args.num_consumers,
+        args.upload_to_s3,
+    )
\ No newline at end of file
diff --git a/open_lm/datapreprocess/make_2048.py b/open_lm/datapreprocess/make_2048.py
index e0da8bb..69e7429 100644
--- a/open_lm/datapreprocess/make_2048.py
+++ b/open_lm/datapreprocess/make_2048.py
@@ -20,7 +20,7 @@
 # ========================================
 
 QUEUE_MAX = 10_000
-BUFFER_MIN = 10_000
+BUFFER_MIN = 100_000
 BUFFER_MAX = 200_000
 CHUNK_SIZE = 2048 + 1
 SHARD_SIZE = 8192
@@ -252,4 +252,4 @@ def main(
         args.num_workers,
         args.num_consumers,
         args.upload_to_s3,
-    )
+    )
\ No newline at end of file
diff --git a/open_lm/datapreprocess/wiki_download.py b/open_lm/datapreprocess/wiki_download.py
index f5f1a05..a4e10da 100644
--- a/open_lm/datapreprocess/wiki_download.py
+++ b/open_lm/datapreprocess/wiki_download.py
@@ -30,4 +30,4 @@ def main(output_dir):
     )
 
     args = parser.parse_args()
-    main(args.output_dir)
\ No newline at end of file
+    main(args.output_dir)
diff --git a/open_lm/eval.py b/open_lm/eval.py
deleted file mode 100644
index 196f006..0000000
--- a/open_lm/eval.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-from open_lm.params import parse_args
-from open_lm.model import test_classif_model
-import webdataset as wds
-from open_lm.data import get_wds_dataset
-from open_lm.data import sample_chunk
-
-args = parse_args([])
-args.per_gpu_val_batch_size = 8
-args.vocab_size = 50432
-args.seq_len = 2048
-args.world_size = 1
-args.rank = 0
-
-args.model = "open_lm_160m"
-model_path = "/media/logs/classif_C4160m3.2B_C4DCLM_320M/checkpoints/epoch_1.pt"
-
-args.val_data = ['/media/datasets/C4/C4-shard-0000219.tar']
-
-model = test_classif_model(args, model_path)
-model = model.to('cuda')
-
-dataset = get_wds_dataset(args, is_train=False, epoch=0, floor=True, tokenizer=None, data_key="txt", force_num_samples=None)
-
-dataloader = dataset.dataloader
-
-sum = 0
-for sample in dataloader:
-    (texts,) = sample
-    texts = torch.LongTensor(texts).to('cuda')
-    inputs, targets = sample_chunk(texts, args)
-
-    with torch.no_grad():
-        out, _, _ = model(inputs)
-
-    pred = torch.argmax(out,2)[:,-1].sum()
-
-    sum = sum + pred.item()
-
-print(sum)
-
-
-
-
diff --git a/open_lm/eval2.py b/open_lm/eval2.py
new file mode 100644
index 0000000..2d5121d
--- /dev/null
+++ b/open_lm/eval2.py
@@ -0,0 +1,96 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+
+###########################################################################################################
+
+args.num_classes = 2
+
+#Dolma_gen.pt
+#DCLM_gen.pt
+#FWEdu_gen.pt
+
+#'C4.pt'
+#'FineWeb.pt'
+#'RefinedWeb.pt'
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+
+
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
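+
+# NOTE: each .pt test set is a list of tokenized sequences. A sequence is
+# classified from the argmax over class logits at its final token, and it counts
+# as correct when that class id matches the set's label (0 for str1, 1 for str2).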
+
+
+
+dataset = torch.load(data_path1)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    sum = sum + n_correct
+
+sum1 = sum
+len1 = len(dataset)
+print(str1, sum1, "/" , len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    sum = sum + n_correct
+
+sum2 = sum
+len2 = len(dataset)
+print(str2, sum2, "/" , len2)
+
+##########################################################################################################################################################################################
+
+
+total_sum = sum1+sum2
+total_length = len1+len2
+
+print("Total= ", total_sum, "/" , total_length )
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/eval3.py b/open_lm/eval3.py
new file mode 100644
index 0000000..9c1e761
--- /dev/null
+++ b/open_lm/eval3.py
@@ -0,0 +1,116 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+
+###########################################################################################################
+
+args.num_classes = 3
+
+#Dolma_gen.pt
+#DCLM_gen.pt
+#FWEdu_gen.pt
+
+#'C4.pt'
+#'FineWeb.pt'
+#'RefinedWeb.pt'
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+
+
+dataset = torch.load(data_path1)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    sum = sum + n_correct
+
+sum1 = sum
+len1 = len(dataset)
+print(str1, sum1, "/" , len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    sum = sum + n_correct
+
+sum2 = sum
+len2 = len(dataset)
+print(str2, sum2, "/" , len2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 2).item()
+
+    sum = sum + n_correct
+
+sum3 = sum
+len3 = len(dataset)
+print(str3, sum3, "/" , len3)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1+sum2+sum3
+total_length = len1+len2+len3
+
+print("Total= ", total_sum, "/" , total_length )
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/eval3_prop.py b/open_lm/eval3_prop.py
new file mode 100644
index 0000000..8338786
--- /dev/null
+++ b/open_lm/eval3_prop.py
@@ -0,0 +1,213 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+#parser.add_argument('--str3', type=str, help='test set 3')
+#parser.add_argument('--str4', type=str, help='test set 4')
+#parser.add_argument('--str5', type=str, help='test set 5')
+#parser.add_argument('--str6', type=str, help='test set 6')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+
+###########################################################################################################
+
+args.num_classes = 2
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+#str3 = cmd_args.str3
+#str4 = cmd_args.str4
+#str5 = cmd_args.str5
+#str6 = cmd_args.str6
+
+
+data1= "Llama1_gen" #"DCLM_gen"
+data2= "Dolma_gen"
+data3= "FWEdu_gen"
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + data1 + '.pt'
+data_path2 = base_path + data2 + '.pt'
+data_path3 = base_path + data3 + '.pt'
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+
+soft_max = torch.nn.Softmax(dim=2)
+###########################################################################################################
+
+pred = []
+conf=[]
+
+dataset = torch.load(data_path1)
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    out = soft_max(out)
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+    conf.append( torch.max(out,2)[0][:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+#c3 = pred.count(2)
+#c4 = pred.count(3)
+#c5 = pred.count(4)
+#c6 = pred.count(5)
+
+
+sum_conf1 = sum(c for p, c in zip(pred, conf) if p == 0)
+sum_conf2 = sum(c for p, c in zip(pred, conf) if p == 1)
+#sum_conf3 = sum(c for p, c in zip(pred, conf) if p == 2)
+#sum_conf4 = sum(c for p, c in zip(pred, conf) if p == 3)
+#sum_conf5 = sum(c for p, c in zip(pred, conf) if p == 4)
+#sum_conf6 = sum(c for p, c in zip(pred, conf) if p == 5)
+
+
+av1 = sum_conf1/c1 if c1>0 else 0
+av2 = sum_conf2/c2 if c2>0 else 0
+#av3 = sum_conf3/c3 if c3>0 else 0
+#av4 = sum_conf4/c4 if c4>0 else 0
+#av5 = sum_conf5/c5 if c5>0 else 0
+#av6 = sum_conf6/c6 if c6>0 else 0
+
+
+
+length = len(dataset)
+
+print(data1, ':')
+print(str1, c1, "/", length, '=', c1/length, "with confidence ", av1)
+print(str2, c2, "/", length, '=', c2/length, "with confidence ", av2)
+#print(str3, c3, "/", length, '=', c3/length, "with confidence ", av3)
+#print(str4, c4, "/", length, '=', c4/length, "with confidence ", av4)
+#print(str5, c5, "/", length, '=', c5/length, "with confidence ", av5)
+#print(str6, c6, "/", length, '=', c6/length, "with confidence ", av6)
+print("\n")
+
+exit()
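+
+# NOTE: everything below this exit() is dead code. The two remaining blocks also
+# reference str3-str7, whose argparse flags are commented out above, so they
+# would raise NameError if the exit() were removed.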
+##########################################################################################################################################################################################
+
+pred = []
+conf=[]
+
+dataset = torch.load(data_path2)
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    out = soft_max(out)
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+    conf.append( torch.max(out,2)[0][:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+c4 = pred.count(3)
+c5 = pred.count(4)
+c6 = pred.count(5)
+c7 = pred.count(6)
+
+sum_conf1 = sum(c for p, c in zip(pred, conf) if p == 0)
+sum_conf2 = sum(c for p, c in zip(pred, conf) if p == 1)
+sum_conf3 = sum(c for p, c in zip(pred, conf) if p == 2)
+sum_conf4 = sum(c for p, c in zip(pred, conf) if p == 3)
+sum_conf5 = sum(c for p, c in zip(pred, conf) if p == 4)
+sum_conf6 = sum(c for p, c in zip(pred, conf) if p == 5)
+sum_conf7 = sum(c for p, c in zip(pred, conf) if p == 6)
+
+av1 = sum_conf1/c1 if c1>0 else 0
+av2 = sum_conf2/c2 if c2>0 else 0
+av3 = sum_conf3/c3 if c3>0 else 0
+av4 = sum_conf4/c4 if c4>0 else 0
+av5 = sum_conf5/c5 if c5>0 else 0
+av6 = sum_conf6/c6 if c6>0 else 0
+av7 = sum_conf7/c7 if c7>0 else 0
+
+
+length = len(dataset)
+
+print(data2, ':')
+print(str1, c1, "/", length, '=', c1/length, "with confidence ", av1)
+print(str2, c2, "/", length, '=', c2/length, "with confidence ", av2)
+print(str3, c3, "/", length, '=', c3/length, "with confidence ", av3)
+print(str4, c4, "/", length, '=', c4/length, "with confidence ", av4)
+print(str5, c5, "/", length, '=', c5/length, "with confidence ", av5)
+print(str6, c6, "/", length, '=', c6/length, "with confidence ", av6)
+print(str7, c7, "/", length, '=', c7/length, "with confidence ", av7)
+print("\n")
+
+##########################################################################################################################################################################################
+
+pred = []
+conf=[]
+
+dataset = torch.load(data_path3)
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    out = soft_max(out)
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+    conf.append( torch.max(out,2)[0][:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+c4 = pred.count(3)
+c5 = pred.count(4)
+c6 = pred.count(5)
+c7 = pred.count(6)
+
+sum_conf1 = sum(c for p, c in zip(pred, conf) if p == 0)
+sum_conf2 = sum(c for p, c in zip(pred, conf) if p == 1)
+sum_conf3 = sum(c for p, c in zip(pred, conf) if p == 2)
+sum_conf4 = sum(c for p, c in zip(pred, conf) if p == 3)
+sum_conf5 = sum(c for p, c in zip(pred, conf) if p == 4)
+sum_conf6 = sum(c for p, c in zip(pred, conf) if p == 5)
+sum_conf7 = sum(c for p, c in zip(pred, conf) if p == 6)
+
+av1 = sum_conf1/c1 if c1>0 else 0
+av2 = sum_conf2/c2 if c2>0 else 0
+av3 = sum_conf3/c3 if c3>0 else 0
+av4 = sum_conf4/c4 if c4>0 else 0
+av5 = sum_conf5/c5 if c5>0 else 0
+av6 = sum_conf6/c6 if c6>0 else 0
+av7 = sum_conf7/c7 if c7>0 else 0
+
+
+length = len(dataset)
+
+print(data3, ':')
+print(str1, c1, "/", length, '=', c1/length, "with confidence ", av1)
+print(str2, c2, "/", length, '=', c2/length, "with confidence ", av2)
+print(str3, c3, "/", length, '=', c3/length, "with confidence ", av3)
+print(str4, c4, "/", length, '=', c4/length, "with confidence ", av4)
+print(str5, c5, "/", length, '=', c5/length, "with confidence ", av5)
+print(str6, c6, "/", length, '=', c6/length, "with confidence ", av6)
+print(str7, c7, "/", length, '=', c7/length, "with confidence ", av7)
+print("\n")
+##########################################################################################################################################################################################
+
+
diff --git a/open_lm/eval3_prop_2048.py b/open_lm/eval3_prop_2048.py
new file mode 100644
index 0000000..d269b36
--- /dev/null
+++ b/open_lm/eval3_prop_2048.py
@@ -0,0 +1,139 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+import webdataset as wds
+from open_lm.data import get_wds_dataset
+from open_lm.data import sample_chunk
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+parser.add_argument('--str4', type=str, help='test set 4')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+args.per_gpu_val_batch_size = 1
+args.vocab_size = 50432
+args.seq_len = 2047
+args.world_size = 1
+args.rank = 0
+
+###########################################################################################################
+
+args.num_classes = 4
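+
+# NOTE: the checkpoint is loaded as a 4-way classifier (str1-str4), but only the
+# first block below reports the share of the fourth class; `length` is
+# hard-coded to 4096 rather than counted from the loader, so the printed
+# fractions assume exactly 4096 sequences per .tar test shard.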
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+str4 = cmd_args.str4
+
+data1= "DCLM"
+data2= "Dolma"
+data3= "FWEdu"
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + data1 + '.tar'
+data_path2 = base_path + data2 + '.tar'
+data_path3 = base_path + data3 + '.tar'
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+###########################################################################################################
+
+args.val_data = [data_path1]
+dataset = get_wds_dataset(args, is_train=False, epoch=0, floor=True, tokenizer=None, data_key="txt", force_num_samples=None)
+dataloader = dataset.dataloader
+
+
+pred = []
+for sample in dataloader:
+    (texts,) = sample
+    inputs = torch.LongTensor(texts).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(inputs)
+
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+c4 = pred.count(3)
+
+length = 4096
+
+print(data1, ':')
+print(str1, c1, "/", length, '=', c1/length)
+print(str2, c2, "/", length, '=', c2/length)
+print(str3, c3, "/", length, '=', c3/length)
+print(str4, c4, "/", length, '=', c4/length)
+
+##########################################################################################################################################################################################
+
+args.val_data = [data_path2]
+dataset = get_wds_dataset(args, is_train=False, epoch=0, floor=True, tokenizer=None, data_key="txt", force_num_samples=None)
+dataloader = dataset.dataloader
+
+
+pred = []
+for sample in dataloader:
+    (texts,) = sample
+    inputs = torch.LongTensor(texts).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(inputs)
+
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+
+length = 4096
+
+print(data2, ':')
+print(str1, c1, "/", length, '=', c1/length)
+print(str2, c2, "/", length, '=', c2/length)
+print(str3, c3, "/", length, '=', c3/length)
+
+##########################################################################################################################################################################################
+
+args.val_data = [data_path3]
+dataset = get_wds_dataset(args, is_train=False, epoch=0, floor=True, tokenizer=None, data_key="txt", force_num_samples=None)
+dataloader = dataset.dataloader
+
+
+pred = []
+for sample in dataloader:
+    (texts,) = sample
+    inputs = torch.LongTensor(texts).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(inputs)
+
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+
+length = 4096
+
+print(data3, ':')
+print(str1, c1, "/", length, '=', c1/length)
+print(str2, c2, "/", length, '=', c2/length)
+print(str3, c3, "/", length, '=', c3/length)
+
+##########################################################################################################################################################################################
+
+
diff --git a/open_lm/eval3_varylength.py b/open_lm/eval3_varylength.py
new file mode 100644
index 0000000..e6f1c5c
--- /dev/null
+++ b/open_lm/eval3_varylength.py
@@ -0,0 +1,118 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+
+###########################################################################################################
+
+args.num_classes = 3
+
+#Dolma_gen.pt
+#DCLM_gen.pt
+#FWEdu_gen.pt
+
+#'C4.pt'
+#'FineWeb.pt'
+#'RefinedWeb.pt'
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+###########################################################################################################
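+
+# NOTE: each .pt file here holds n_bins lists of tokenized sequences, bucketed
+# by sequence length in 200-token steps (bin i covers lengths i*200 to
+# i*200+200). Correct predictions are tallied per bin; since len1/len2/len3
+# keep only the *last* bin's sample count, the final per-bin accuracy assumes
+# every bin holds the same number of samples.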
+dataset = torch.load(data_path1)
+n_bins = len(dataset)
+correct = torch.zeros(n_bins, dtype=torch.int)
+
+for i in range(n_bins):
+    n_samples = len(dataset[i])
+    for j in range(n_samples):
+        sample = torch.LongTensor(dataset[i][j]).to('cuda')
+        with torch.no_grad():
+            out, _, _ = model(sample)
+        pred = torch.argmax(out, 2)[:, -1]
+
+        if pred == 0:
+            correct[i] += 1
+
+sum1 = correct
+len1 = n_samples  # assumes every bin holds the same number of samples
+print(str1, sum1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+n_bins = len(dataset)
+correct = torch.zeros(n_bins, dtype=torch.int)
+
+for i in range(n_bins):
+    n_samples = len(dataset[i])
+    for j in range(n_samples):
+        sample = torch.LongTensor(dataset[i][j]).to('cuda')
+        with torch.no_grad():
+            out, _, _ = model(sample)
+        pred = torch.argmax(out, 2)[:, -1]
+
+        if pred == 1:
+            correct[i] += 1
+
+sum2 = correct
+len2 = n_samples
+print(str2, sum2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+n_bins = len(dataset)
+correct = torch.zeros(n_bins, dtype=torch.int)
+
+for i in range(n_bins):
+    n_samples = len(dataset[i])
+    for j in range(n_samples):
+        sample = torch.LongTensor(dataset[i][j]).to('cuda')
+        with torch.no_grad():
+            out, _, _ = model(sample)
+        pred = torch.argmax(out, 2)[:, -1]
+
+        if pred == 2:
+            correct[i] += 1
+
+sum3 = correct
+len3 = n_samples
+print(str3, sum3)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1 + sum2 + sum3
+total_len = len1 + len2 + len3
+
+print(len1, len2, len3, "\n")
+
+for i in range(n_bins):
+    print("Accuracy at bin", i, "(seq. lengths", i*200, "to", i*200+200, ") is:", total_sum[i].item()/total_len * 100, "%")
+
diff --git a/open_lm/eval3_varylength_2000.py b/open_lm/eval3_varylength_2000.py
new file mode 100644
index 0000000..06ada50
--- /dev/null
+++ b/open_lm/eval3_varylength_2000.py
@@ -0,0 +1,124 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+###########################################################################################################
+
+args.num_classes = 3
+
+# Candidate test-set triples: Dolma_gen / DCLM_gen / FWEdu_gen, or C4 / FineWeb / RefinedWeb (.pt files).
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+indices = torch.arange(0, 2048, 200)
+
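+# Accuracy is probed at fixed token positions (0, 200, ..., 2000): for each
+# batch the classifier's argmax is read at all probe indices at once, so
+# `pred` below has shape (batch, len(indices)). A minimal sketch of the idea
+# (illustrative shapes, assuming inputs of length 2048):
+#
+#   out, _, _ = model(batch)                  # (B, 2048, num_classes)
+#   pred = torch.argmax(out, 2)[:, indices]   # (B, 11)
+#   n_correct = (pred == label).sum(dim=0)    # per-position correct counts
+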
+correct = torch.zeros(len(indices))
+
+dataset = torch.load(data_path1)
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, indices]
+
+    n_correct = torch.sum(pred == 0, dim=0)
+
+    correct = correct + n_correct.cpu()
+
+sum1 = correct
+len1 = len(dataset)
+print(str1, sum1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+
+correct = torch.zeros(len(indices))
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, indices]
+
+    n_correct = torch.sum(pred == 1, dim=0)
+
+    correct = correct + n_correct.cpu()
+
+sum2 = correct
+len2 = len(dataset)
+print(str2, sum2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+
+correct = torch.zeros(len(indices))
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, indices]
+
+    n_correct = torch.sum(pred == 2, dim=0)
+
+    correct = correct + n_correct.cpu()
+
+sum3 = correct
+len3 = len(dataset)
+print(str3, sum3)
+
+##########################################################################################################################################################################################
+
+total = sum1 + sum2 + sum3
+
+print(len1, len2, len3, "\n")
+
+for i in range(len(indices)):
+    print("Accuracy at token", indices[i].item(), "=", total[i].item()/(len1+len2+len3))
+
+
diff --git a/open_lm/eval4.py b/open_lm/eval4.py
new file mode 100644
index 0000000..a732bbf
--- /dev/null
+++ b/open_lm/eval4.py
@@ -0,0 +1,139 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+parser.add_argument('--str4', type=str, help='test set 4')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+###########################################################################################################
+
+args.num_classes = 4
+
+# Candidate test sets: Dolma_gen / DCLM_gen / FWEdu_gen, or C4 / FineWeb / RefinedWeb (.pt files).
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+str4 = cmd_args.str4
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+data_path4 = base_path + str4 + '.pt'
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
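+# Each of the four blocks below scores one test set: dataset k is correct when
+# the argmax at the final token equals class k. A reusable sketch (illustrative
+# only, not part of open_lm's API):
+#
+#   def score(path, label):
+#       data = torch.load(path)
+#       hits = 0
+#       for s in data:
+#           with torch.no_grad():
+#               out, _, _ = model(torch.LongTensor(s).to('cuda'))
+#           hits += torch.sum(torch.argmax(out, 2)[:, -1] == label).item()
+#       return hits, len(data)
+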
+dataset = torch.load(data_path1)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    correct = correct + n_correct
+
+sum1 = correct
+len1 = len(dataset)
+print(str1, sum1, "/", len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    correct = correct + n_correct
+
+sum2 = correct
+len2 = len(dataset)
+print(str2, sum2, "/", len2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 2).item()
+
+    correct = correct + n_correct
+
+sum3 = correct
+len3 = len(dataset)
+print(str3, sum3, "/", len3)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path4)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 3).item()
+
+    correct = correct + n_correct
+
+sum4 = correct
+len4 = len(dataset)
+print(str4, sum4, "/", len4)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1 + sum2 + sum3 + sum4
+total_length = len1 + len2 + len3 + len4
+
+print("Total= ", total_sum, "/", total_length)
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/eval5.py b/open_lm/eval5.py
new file mode 100644
index 0000000..681223c
--- /dev/null
+++ b/open_lm/eval5.py
@@ -0,0 +1,162 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+parser.add_argument('--str4', type=str, help='test set 4')
+parser.add_argument('--str5', type=str, help='test set 5')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+###########################################################################################################
+
+args.num_classes = 5
+
+# Candidate test sets: Dolma_gen / DCLM_gen / FWEdu_gen, or C4 / FineWeb / RefinedWeb (.pt files).
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+str4 = cmd_args.str4
+str5 = cmd_args.str5
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+data_path4 = base_path + str4 + '.pt'
+data_path5 = base_path + str5 + '.pt'
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+dataset = torch.load(data_path1)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    correct = correct + n_correct
+
+sum1 = correct
+len1 = len(dataset)
+print(str1, sum1, "/", len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    correct = correct + n_correct
+
+sum2 = correct
+len2 = len(dataset)
+print(str2, sum2, "/", len2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 2).item()
+
+    correct = correct + n_correct
+
+sum3 = correct
+len3 = len(dataset)
+print(str3, sum3, "/", len3)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path4)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 3).item()
+
+    correct = correct + n_correct
+
+sum4 = correct
+len4 = len(dataset)
+print(str4, sum4, "/", len4)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path5)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 4).item()
+
+    correct = correct + n_correct
+
+sum5 = correct
+len5 = len(dataset)
+print(str5, sum5, "/", len5)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1 + sum2 + sum3 + sum4 + sum5
+total_length = len1 + len2 + len3 + len4 + len5
+
+print("Total= ", total_sum, "/", total_length)
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/eval_redpajama_seq.py b/open_lm/eval_redpajama_seq.py
new file mode 100644
index 0000000..85fe453
--- /dev/null
+++ b/open_lm/eval_redpajama_seq.py
@@ -0,0 +1,167 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+###########################################################################################################
+args.num_classes = 6
+
+path1 = "/media/datasets/RedPajama/val_seq/arxiv-shard-0000019.pt"
+path2 = "/media/datasets/RedPajama/val_seq/c4-shard-0000019.pt"
+path3 = "/media/datasets/RedPajama/val_seq/cc-shard-0000019.pt"
+path4 = "/media/datasets/RedPajama/val_seq/gh-shard-0000019.pt"
+path5 = "/media/datasets/RedPajama/val_seq/se-shard-0000019.pt"
+path6 = "/media/datasets/RedPajama/val_seq/wiki-shard-0000009.pt"
+
+str1 = "Arxiv"
+str2 = "C4"
+str3 = "CC"
+str4 = "Github"
+str5 = "StackExchange"
+str6 = "Wikipedia"
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
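+# Example invocation (model and checkpoint names are placeholders, not committed defaults):
+#   python -m open_lm.eval_redpajama_seq --model open_lm_25m \
+#       --classif-model-path /path/to/checkpoints/epoch_1.pt
+# The six blocks below score one RedPajama source each; label k must match the
+# class index the classifier was trained with for that source.
+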
+dataset = torch.load(path1)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    correct = correct + n_correct
+
+sum1 = correct
+len1 = len(dataset)
+print(str1, sum1, "/", len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path2)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    correct = correct + n_correct
+
+sum2 = correct
+len2 = len(dataset)
+print(str2, sum2, "/", len2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path3)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 2).item()
+
+    correct = correct + n_correct
+
+sum3 = correct
+len3 = len(dataset)
+print(str3, sum3, "/", len3)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path4)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 3).item()
+
+    correct = correct + n_correct
+
+sum4 = correct
+len4 = len(dataset)
+print(str4, sum4, "/", len4)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path5)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 4).item()
+
+    correct = correct + n_correct
+
+sum5 = correct
+len5 = len(dataset)
+print(str5, sum5, "/", len5)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path6)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 5).item()
+
+    correct = correct + n_correct
+
+sum6 = correct
+len6 = len(dataset)
+print(str6, sum6, "/", len6)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1 + sum2 + sum3 + sum4 + sum5 + sum6
+total_length = len1 + len2 + len3 + len4 + len5 + len6
+
+print("Total= ", total_sum, "/", total_length)
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/extra_funcs.py b/open_lm/extra_funcs.py
deleted file mode 100644
index b746d5e..0000000
--- a/open_lm/extra_funcs.py
+++ /dev/null
@@
-1,214 +0,0 @@ -import os -import shutil -import random -import json -import torch -import numpy as np -import subprocess - -from open_lm.params import parse_args -from open_lm.model import test_classif_model - -def inference(): - - args = parse_args([]) - args.model = "open_lm_25m" - args.classif_model_path = "/workspace/youssef/lrz/logs/RedPajama/prop/checkpoints/epoch_1.pt" - args.num_classes = 2 - - test_data_path = '/workspace/youssef/lrz/datasets/prop/Llama1_gen.pt' - dataset = torch.load(test_data_path) - - model = test_classif_model(args) - model = model.to('cuda:3') - - pred = [] - for sample in dataset: - sample = torch.LongTensor(sample).to('cuda:3') - with torch.no_grad(): - out, _, _ = model(sample) - pred.append(torch.argmax(out,2)[:,-1].item()) - - c1 = pred.count(0) - c2 = pred.count(1) - - print(c1,c2) - - if c2 > c1: - return 1 - else: - return 0 - -def train_classifier(cuda_devices="3", log_dir="/workspace/youssef/lrz/logs/RedPajama/prop"): - # Set the CUDA_VISIBLE_DEVICES environment variable - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_devices - - # Generate a random master port between 10000 and 65000 - master_port = random.randint(10000, 65000) - - # Construct the torchrun command - command = [ - "torchrun", - f"--master_port={master_port}", - "--nproc-per-node", "1", - "-m", "open_lm.main", - "--model", "open_lm_25m", - "--dataset-manifest", "/workspace/youssef/lrz/datasets/prop/train/manifest.jsonl", - "--train-num-samples", "200000000", - "--workers", "1", - "--precision", "amp_bfloat16", - "--grad-checkpointing", - "--log-every-n-steps", "100", - "--grad-clip-norm", "1", - "--global-batch-size", "16", - "--data-key", "txt", - "--lr", "3e-4", - "--warmup", "2000", - "--wd", "0.1", - "--beta2", "0.95", - "--epochs", "1", - "--resume", "latest", - "--logs", "/workspace/youssef/lrz/logs/RedPajama/", - "--name", "prop", - "--classification", "True", - "--num-classes", "2", - "--classif-model-path", "/workspace/youssef/lrz/logs/pretrain/25M_0.5BC4/checkpoint/epoch_1.pt" - ] - - os.makedirs(log_dir, exist_ok=True) - - # Create log file paths - stdout_log = os.path.join(log_dir, "output.log") - stderr_log = os.path.join(log_dir, "error.log") - - # Run the torchrun command using subprocess - with open(stdout_log, "w") as out_file, open(stderr_log, "w") as err_file: - try: - result = subprocess.run(command, check=True, stdout=out_file, stderr=err_file) - print(f"torchrun finished with return code: {result.returncode}") - except subprocess.CalledProcessError as e: - print(f"An error occurred while running torchrun: {e}") - - - -def proj_simplex(y): - m = len(y) - bget = False - s = sorted(y, reverse=True) # sorting in descending order - tmpsum = 0 - for i in range(m-1): - tmpsum = tmpsum + s[i] - tmax = (tmpsum - 1) / (i+1) - if tmax >= s[i+1]: - bget = True - break - if not bget: - tmax = (tmpsum + s[m-1] -1) / m - return np.maximum(y-tmax,0) - - - -def del_dir(dir_path): - try: - # Remove the directory and all its contents - shutil.rmtree(dir_path) - print(f"Removed directory: {dir_path}") - except FileNotFoundError: - print(f"Directory not found: {dir_path}") - except PermissionError: - print(f"Permission denied: {dir_path}") - except Exception as e: - print(f"An error occurred while removing the directory: {e}") - - -def round_preserving_sum(numbers): - """ - This function takes a list of numbers that add up to 1, multiplies each by 100, - rounds them to integers while preserving the sum as 100. 
- """ - # Step 1: Multiply all numbers by 100 - multiplied = np.array(numbers) * 100 - - # Step 2: Separate integer and decimal parts - integers = np.floor(multiplied).astype(int) # Integer parts - decimals = multiplied - integers # Decimal parts - - # Step 3: Calculate the difference between the current sum and 100 - current_sum = np.sum(integers) - difference = 100 - current_sum - - # Step 4: Distribute the difference by rounding up the largest decimals - if difference > 0: - # Get indices of the largest decimals and round up those numbers - indices_to_round_up = np.argsort(-decimals)[:difference] - integers[indices_to_round_up] += 1 - - return integers.tolist() - -def sample_and_rename_files(sample_counts_list): - - base_path = "/workspace/youssef/lrz/datasets/prop/original/" - output_folder = "/workspace/youssef/lrz/datasets/prop/train/" - - # Define the folder names in order - file_names = ['arxiv', 'c4', 'cc', 'github', 'se', 'wiki'] - folder_names = [os.path.join(base_path, folder) for folder in file_names] - - # Check if the provided sample_counts_list contains exactly two lists - if len(sample_counts_list) != 2 or any(len(sample_counts) != 6 for sample_counts in sample_counts_list): - raise ValueError("sample_counts_list must contain exactly two lists, each with 6 numbers.") - - # Create the output folder if it doesn't exist - if not os.path.exists(output_folder): - os.makedirs(output_folder) - - # List to store the manifest data - manifest_data = [] - - # Loop over the two lists of sample counts - for index, sample_counts in enumerate(sample_counts_list): - # Iterate through each folder and sample the required number of .tar files - for i, folder in enumerate(folder_names): - folder_path = os.path.join(folder) - - if not os.path.exists(folder_path): - raise ValueError(f"Folder {folder_path} does not exist.") - - # Get all .tar files from the current folder - all_files = [f for f in os.listdir(folder_path) if f.endswith('.tar')] - - # Ensure the sample count is not more than available files - sample_count = min(sample_counts[i], len(all_files)) - - # Randomly sample the required number of files from the folder - sampled_files = random.sample(all_files, sample_count) - - # Copy each sampled file to the output folder with the new name - for file_name in sampled_files: - # Construct source file path - source_file_path = os.path.join(folder_path, file_name) - - # Create the new filename by prepending the index (0 or 1) with a dash - new_file_name = f"{index}-{file_name[:-4]}" # Remove the .tar extension - - # Destination path in the output folder - dest_file_path = os.path.join(output_folder, new_file_name + '.tar') # Keep .tar in destination - - # Copy the file to the output folder - shutil.copy2(source_file_path, dest_file_path) - - # Add entry to manifest_data, replacing ".tar" in new_file_name with an empty string - manifest_entry = { - "shard": new_file_name, # No .tar extension - "num_sequences": 489 # Set a fixed number of sequences - } - manifest_data.append(manifest_entry) - - # Write the manifest.jsonl file - manifest_file_path = os.path.join(output_folder, "manifest.jsonl") - with open(manifest_file_path, 'w') as manifest_file: - # Write each entry except the last one with a newline - for entry in manifest_data: - manifest_file.write(json.dumps(entry) + '\n') - - print(f"Files sampled and saved in {output_folder}. 
Manifest file created as {manifest_file_path}.") \ No newline at end of file diff --git a/open_lm/hf/__init__.py b/open_lm/hf/__init__.py new file mode 100644 index 0000000..8493168 --- /dev/null +++ b/open_lm/hf/__init__.py @@ -0,0 +1,3 @@ +from .configuration_openlm import OpenLMConfig +from .modeling_openlm import OpenLMForCausalLM +from .tokenization_openlm import OpenLMTokenizerFast diff --git a/open_lm/hf/__pycache__/__init__.cpython-310.pyc b/open_lm/hf/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72a80db20c9a931ace21be1fd1eb1076b71a4f27 GIT binary patch literal 319 zcmd1j<>g`kg1x`bq)i6Wk3k${zy#zt0CBMfkVs)jVa#F3WsG8E1hJWNm~xq;n89r3 z9F|}!P-iT5=%1k^WyV?M&;xdfm{^DpPQeOnv@{qp>x?BasN1kSd%&UCnQ;dBbR3Qs{KhRMXvzQVU-Y07EKw0{M$WTVNL`KTL@!5{@?x!+VI}aY5 zNIsFp$2q>puj4fyXfLln8H3%H{m?f!dlMF1PJ_jy}E47grelVLBo|Jvq_sg$t>@4?{pvb}A5*|nv%2KBJnIEiIX%Y=Zrv`{U_ccYBiMIagj nZ0{1Iiqf`_!|84rr0SBxLH8Sc4;Cc^-aMGnsh4_le?IvKM$p?? literal 0 HcmV?d00001 diff --git a/open_lm/hf/__pycache__/modeling_openlm.cpython-310.pyc b/open_lm/hf/__pycache__/modeling_openlm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84b00fbcefa4508866c19f33a5e5d6f73de281e6 GIT binary patch literal 5904 zcma)ATXP&o6`tvtoxQK5wQNhiPMi>X9Y;=82uYbFKsh!^Vao<91cu9KwB0L>JUg@L znYAp{E~H{p;U)ZnRAEy*;1~V?#XsPI=YHkMk3hmDH}jpIU1?>>z;4Z%)2B~QpFVwV z-ATP(G4T82XMgH^{kUQLg&LDT8;x(`N&gJM4Q@t;M{gFHUFI>9a4WL9wr6(>UZLxF zPPgb4HQkI#UJ3nnRPI*1il!B!YPaUqH10(8Zo_M6yckV&r@d*7m!g?&(`$AQcn5U5 z9L;tQdIvRLi4JuSdxwEnc`cgj&U^EwA&h%R#L@evcZ}D?e24J{pSo*%$G>gxX+Cq; z;4`9f$MQ~y>3b)|yf|^6#0l%2;ff%ISn6;%UoP_LVTrROwnT3zImARAH&# zi-an?9it7-sq0cK$smjcznX9nfmFB{wATcXLl=X78bnvFUh8MQe)dk7tz8e&3_Y{o z>Su}OznsW1SBoj3rA_m5P03jrNZ`{~&Tl00X4(td z;{0aPPgBu3pFp{O)ZKosb7U({XbZK|_vDS}M+lOOgwGU@s~DYD#BM)MuQ zOeq1oP%~?0(`4^8ScTP0dTb`)fkqq8uHVs7E9KZ%VR$AtJjPAW61G?1%yak|ZVQ_; zQ52kvnQ_OsXAaE!rU_R}=iyFp z&w=4v9B|$8ZS8E9ra9cW0WS1vti=|U)$a9MMO8@qm|c~8Ka9i7_q91mG9Rz1DO}tN z+d|erktYFKma2E$y6(5iA|doYwO4!O)ZS) zSLHB96!b%$s=7u~UvyUlnhiLxh$#;E-5|X=*urFhuhwV%?^kBc2q#;eOoV>G5 z3mg9^k~nxS;z>US7%*=1OzI0k#)&%0Zo%kwh!MfDGLD z1?yPz#=XLQV>k_tGSVQ?;9;6JlR4|fdnNQX2h;b;yaMxqpWkQTo}u=Ab2tO6Hke+o zYaD!onLBJal{E%agF57D@Y;tqyr90g_42M=bHmh4Mdq&bGdJILmnX3QILTa?UN4cE z8)ojN$hKbI-V~qqlScku1DCd5*t4rNn5MvY$I}SzhVe*-trz#(^#3sk$>CY-t2{&C zEP-7((&217 zrm0#xi2YR&3yCN(#GsUR-Zr;R?45LybyqAeVhDwb@6h;P0*pe%D;Z%X(hXb_#TJvZ zJMEzzmq#sdtTW;nA}r+Cz`kkA(*t{0h>K$yXLAS*1EhX(_W^=L?({=s4deprp2aP0 z-$$7F$Qn8W2kz@EYID{$toMJZW#v|~ZSOGcN|=T*;$++w$_nEQNn|ZRHD>jaUaL4^ zT6)kt7O1K^Hb){}&DOkGWW(f^pi-w3-u83qxX8+*$75Eaj)S*{;OYbO|!w8cEfVahBeEW#b6=M7j|Lm{N4@vuit#4t)wMPgb-=0<23NVTztTC zZ*64B%{*0ccylT*f$i~S%u+$FmoUNk5lK&GR5G`o-7DpOLsUVhxHgR%{|Ki?Cm_m9 z*kwrRWB+d$auEc14FKY5-~S#F9<%OQMCv`_{Thr~yxP9v?KkAfx|_qta^1-i-Z(Cx zE}=J)hnDd{ArD`*4H2%cWq!5^WoiY>>lmw?JofCX-Gij6QDE0KgWibWko@FsB?OZa z+d93!iua6rvT|K~Y_y1qRx(4;MFkcqocYM|V}-vBPI-gCW12olYTGmA@4z_{6{Y^Z zqV7BHiTailR9h_g1csptOgdFvOnM=bA)PkdYgeu&?inu9whZ;*)kVdWY_t~Kx7^%I zku@_nh`F1s36Ro$l%?)^KSfFxZ5*xK)j+NUtKyuSCg4Hdg{$l29Sgw)&kI|ccZoX3 zM>)#12pxG57DT9xBhkS`sT&}1KcDn=#=WC0!PS;P!`2{8gav@NlSGgTg@+*kg5Quj6Y@oZ2jYlf;6LbpK}*+;K`RpCbT@t#mjD2WNXR9 zh3$|QlZ8TWLaqTQm%HYI91JFz=8W+I7Wy|A3y2rO7<4MIxaI( zDl;GUm7;`SbL>~DAc=JVA zTdlo|c*3Y$l%w;x2Ko}%ybq>WSGH0DNe zq@5x^3VJi-KBEhcJkv{BetwcXMcAt}sB$HVSC^p4M5^lLC<(Gr!^z*06sYW#B#9(t zUP+-`76?!&E2(~zmkC@Y@EU+}u4-}RcZsq@;0FYLNWdfTCV(nmqH`tX2>DZL*GGrg zwuqwfK~}5&85*LC)f2#|Hco#AP(s~j+qQX(vBtRkoOoOI-)zRdELrS}lFdFZ71(Dr zhkaTqvVS`z_AjT*{^?ZMKb$K2gw-(a5&O6@RrzblpNNu 
zA331vhc&;HBumRxML6Ut)}X2gBZ0p86RExkQx5LPXmzB)T=!aST7g>kwTU>CN-9ge zxk=JOe(puIiBhe6@A$569bz%VD3hgnN7U|5Xr6Q-z*VNKUh+F(_B6;i>{UniGM$x2JOWt&6YKzDESpUyGdz%{JtgT~LN=l%ngJu1uq literal 0 HcmV?d00001 diff --git a/open_lm/hf/__pycache__/tokenization_openlm.cpython-310.pyc b/open_lm/hf/__pycache__/tokenization_openlm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd3cc05fe64b54d1ea671e7d0dac39bc7420b4be GIT binary patch literal 525 zcmY*Wu};G<5Vf6zhANdRKw@Oh()>VG6(PX_1&K~sEUUh>p>bU7M5wH=@*%A3`~hF! zm5E<~1v{xqImu_=o$uYfJK5`Xk(KwKkL-;g^y!RU@vSkoM~|%Hh~okYoMJ&zf}KqY zPxz^i(FNij_n#5>fjoI>0QhbXd+A3>Sp8wFFv z@)G#gdCqhdV-sDkpuD^~lVz6AXD)qLur} zfKtJ^(ovX-^hkR{SKMGLXL7g}c zlDc>y2CKCSEKcY;baM=mPa2OUc9dCybdv>&&I+dWB<`4~QmoXOR05P%b|px8R;4GK wqJu>?m^I_5)owNADm3UOR4~tV1?rDR`*N!`;jm>ra?{hUxju Union[Tuple, CausalLMOutputWithPast]:
+        if inputs_embeds is not None:
+            log.warning("inputs_embeds is set but OpenLM does not support it yet")
+        if attention_bias is not None:
+            log.warning("attention_bias is set but OpenLM does not support it yet")
+        if use_cache is None:
+            use_cache = True
+        if output_attentions:
+            raise ValueError("output_attentions is not yet supported in OpenLM")
+        if output_hidden_states:
+            raise ValueError("output_hidden_states is not yet supported in OpenLM")
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model.forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+        )
+
+        logits = outputs[0]
+        past_key_values = outputs[2]
+        hidden_states = None
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = torch.nn.CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.model_config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=past_key_values,
+            hidden_states=hidden_states,
+        )
+
+    def can_generate(self) -> bool:
+        return True
+
+    def prepare_inputs_for_generation(
+        self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values[0][1], int):
+                # This assumes that the second item of past key values is the length of the past (this is the case for linear attention)
+                past_length = past_key_values[0][1]
+            else:
+                # This assumes that the first item of past key values is a list of all the past keys, thus the
+                # shape 1 is the length of the past (this is the case for attention without window)
+                past_length = past_key_values[0][0].shape[1]
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+
+        model_inputs = {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.pop("use_cache", True),
+        }
+        return model_inputs
+
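+    # Usage sketch for the registered classes (illustrative; assumes a local
+    # checkpoint directory saved with save_pretrained and these auto-class mappings):
+    #
+    #   from transformers import AutoModelForCausalLM, AutoTokenizer
+    #   model = AutoModelForCausalLM.from_pretrained(ckpt_dir)
+    #   tok = AutoTokenizer.from_pretrained(ckpt_dir)
+    #   ids = tok("The capital of France is", return_tensors="pt").input_ids
+    #   print(tok.decode(model.generate(ids, max_new_tokens=8)[0]))
+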
+    def get_input_embeddings(self) -> torch.nn.Module:
+        return self.model.tok_embeddings
+
+    def set_input_embeddings(self, value: torch.nn.Module):
+        self.model.tok_embeddings = value
+
+    def get_output_embeddings(self):
+        if self.model_config.weight_tying:
+            return self.model.tok_embeddings
+        else:
+            return self.model.output
+
+    def set_output_embeddings(self, value: torch.nn.Module):
+        if self.model_config.weight_tying:
+            self.model.tok_embeddings = value
+        else:
+            self.model.output = value
+
+    def tie_weights(self):
+        """
+        Adapted from OLMo (see description below); removing this no-op degraded generation to garbage, so it must stay.
+        This function is intentionally left as a no-op.
+        Weight tying is handled as follows:
+        - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
+        See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
+        - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
+        See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
+        Therefore, there is no need to explicitly tie the weights in this function.
+        """
+        pass
+
+    def resize_token_embeddings(
+        self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
+    ) -> torch.nn.Embedding:
+        raise NotImplementedError
+
+
+# Register the model so that it is available for transformer pipelines, auto-loading, etc.
+AutoModelForCausalLM.register(OpenLMConfig, OpenLMForCausalLM)
diff --git a/open_lm/hf/tokenization_openlm.py b/open_lm/hf/tokenization_openlm.py
new file mode 100644
index 0000000..e8abdd6
--- /dev/null
+++ b/open_lm/hf/tokenization_openlm.py
@@ -0,0 +1,18 @@
+# Follows OLMo's HF template
+
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
+
+from open_lm.hf.configuration_openlm import OpenLMConfig
+
+
+class OpenLMTokenizerFast(PreTrainedTokenizerFast):
+    # Note: OpenLM's tokenizer is already a wrapper around huggingface. This is potentially unnecessary.
+    pass
+
+    # def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+    #     # This is required to make the implementation complete.
+    #     pass
+
+
+# Register the tokenizer class so that it is available for transformer pipelines, auto-loading, etc.
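+# Loading sketch (illustrative; assumes a checkpoint directory that contains
+# tokenizer files saved via save_pretrained):
+#   from transformers import AutoTokenizer
+#   tok = AutoTokenizer.from_pretrained(ckpt_dir)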
+AutoTokenizer.register(OpenLMConfig, fast_tokenizer_class=OpenLMTokenizerFast) diff --git a/open_lm/infer_proportions.py b/open_lm/infer_proportions.py deleted file mode 100644 index 1dadd64..0000000 --- a/open_lm/infer_proportions.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch -import numpy as np - -from extra_funcs import train_classifier, proj_simplex, round_preserving_sum, sample_and_rename_files, inference, del_dir - -def comparison(x, xcandidate): - - list1 = round_preserving_sum(x.tolist()) - list2 = round_preserving_sum(xcandidate.tolist()) - list = [list1, list2] - - sample_and_rename_files(list) - - train_classifier() - - result = inference() - - del_dir("/workspace/youssef/lrz/logs/RedPajama/prop") - del_dir("/workspace/youssef/lrz/datasets/prop/train") - - return result - - -def gradientless_descent(N=6, num_iter=200, radius = 0.2, alpha=0.5): - - #For measuring error - xorig = np.array([0.0325,0.1575,0.6775,0.0525,0.0275,0.0525]) - - # initialize x with equal probability - x = np.ones(N)/N - - error = [] - prop = [] - - for i in range(num_iter): - - stepsize = 1/(i+1)**alpha - # choose random direction with radius R - dir = np.random.randn(N) - dir = dir/np.linalg.norm(dir)*radius*stepsize - - xcandidate = proj_simplex( x + dir ) - - # compare x with x+dir and update x - if comparison(x, xcandidate) == 1: - x = xcandidate - - print(i, np.linalg.norm(x-xorig), x) - error.append(np.linalg.norm(x-xorig)) - prop.append(x) - - torch.save(error, "error.pt") - torch.save(prop, "prop.pt") - return x - -if __name__ == "__main__": - gradientless_descent() diff --git a/open_lm/main2.py b/open_lm/main2.py deleted file mode 100644 index 55863f8..0000000 --- a/open_lm/main2.py +++ /dev/null @@ -1,1034 +0,0 @@ -import atexit -import logging -import os -import re -import sys -import random -from datetime import datetime -import functools -import numpy as np -from pathlib import Path -import json -import traceback - -import fsspec -import torch -from torch import optim -from torch.cuda.amp import GradScaler - -import torch.distributed as dist - -from open_lm.data import sample_chunk - -from torch.distributed.fsdp import ( - FullyShardedDataParallel as FSDP, - MixedPrecision, - BackwardPrefetch, - ShardingStrategy, - FullStateDictConfig, - StateDictType, - CPUOffload, -) -from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy - -from open_lm.data import proc_token -from open_lm.model import Block -from open_lm.losses import CrossEntropyLossWithZLoss -from open_lm.utils.averaging_utils import ModelAverager - -try: - import wandb -except ImportError: - wandb = None - -try: - import torch.utils.tensorboard as tensorboard -except ImportError: - tensorboard = None - -from open_lm.model import create_model -from open_lm.model import create_classif_model - -from open_lm.utils.transformers.hf_wrapper import create_wrapped_hf_model -from open_lm.data import get_data, get_wds_dataset -from open_lm.distributed import is_master, init_distributed_device, broadcast_object -from open_lm.logger import setup_logging -from open_lm.params import parse_args -from open_lm.scheduler import cosine_lr, const_lr -from open_lm.train import train_one_epoch -from open_lm.evaluate import evaluate_loop -from open_lm.file_utils import ( - pt_load, - check_exists, - start_sync_process, - remote_sync_with_expon_backoff, - get_metadata_file, - get_string_for_epoch, - log_num_checkpoints, - terminate_sync_process, -) - - -LATEST_CHECKPOINT_NAME = "epoch_latest.pt" - - -def random_seed(seed=42, rank=0): - 
torch.manual_seed(seed + rank) - np.random.seed(seed + rank) - random.seed(seed + rank) - - -def natural_key(string_): - """See http://www.codinghorror.com/blog/archives/001018.html""" - return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())] - - -def get_latest_checkpoint(path: str): - is_s3 = path.startswith("s3") - fs, root_path = fsspec.core.url_to_fs(path) - checkpoints = fs.glob(os.path.join(root_path, "epoch_*.pt")) - if checkpoints: - checkpoints = sorted(checkpoints, key=natural_key) - return f"s3://{checkpoints[-1]}" if is_s3 else checkpoints[-1] - - return None - - -def get_state_dict(name): - checkpoint = pt_load(name, map_location="cpu") - if "epoch" in checkpoint: - sd = checkpoint["state_dict"] - if next(iter(sd.items()))[0].startswith("module"): - sd = {k[len("module.") :]: v for k, v in sd.items()} - else: - sd = checkpoint - return sd - - -def load_model(args, model, different_seed=False): - checkpoint = pt_load(args.resume, map_location="cpu") - if "epoch" in checkpoint: - if not different_seed and "shard_shuffle_seed" in checkpoint: - pretrained_seed = checkpoint["shard_shuffle_seed"] - assert ( - pretrained_seed == args.seed - ), f"This checkpoint was trained with a random seed of {pretrained_seed}. Since this seed affects shard shuffling, resuming training must use the same seed." - else: - if different_seed: - message = "Resuming a checkpoint without checking that the seed match. This means that training might not be reproducible." - else: - message = "Resuming a checkpoint that does not have a seed saved. This means that the shards were not shuffled, so they will remain unshuffled." - logging.info(message) - pretrained_seed = None - - # resuming a train checkpoint w/ epoch and optimizer state - start_epoch = checkpoint["epoch"] - sd = checkpoint["state_dict"] - global_step = checkpoint.get("step", None) - if next(iter(sd.items()))[0].startswith("module"): - sd = {k[len("module.") :]: v for k, v in sd.items()} - if "_orig_mod" in next(iter(sd.items()))[0]: - sd = {k.replace("_orig_mod.", ""): v for k, v in sd.items()} - if args.fsdp: - model.load_state_dict(sd) - elif args.distributed: - model.module.load_state_dict(sd) - else: - model.load_state_dict(sd) - logging.info(f"=> resuming checkpoint '{args.resume}' (epoch {start_epoch})") - else: - # loading a bare (model only) checkpoint for fine-tune or evaluation - start_epoch, global_step = 0, 0 - pretrained_seed = None - model.load_state_dict(checkpoint) - logging.info(f"=> loaded checkpoint '{args.resume}' (epoch {start_epoch})") - return start_epoch, global_step, pretrained_seed - - -def load_avg_models(args, averagers): - checkpoint = pt_load(args.resume, map_location="cpu") - if "epoch" in checkpoint: - # resuming a train checkpoint w/ epoch and optimizer state - start_epoch = checkpoint["epoch"] - if averagers is not None: - for k in averagers.avgs_dict: - avg_sd = torch.load(args.resume.replace("epoch", k), map_location="cpu") - if next(iter(avg_sd.items()))[0].startswith("module"): - avg_sd = {k[len("module.") :]: v for k, v in avg_sd.items()} - if "_orig_mod" in next(iter(avg_sd.items()))[0]: - avg_sd = {k.replace("_orig_mod.", ""): v for k, v in avg_sd.items()} - averagers.avgs_dict[k].load_state_dict_avg(avg_sd) - logging.info( - f"=> resuming averager for {k} from checkpoint '{args.resume.replace('epoch', k)} (epoch {start_epoch})" - ) - return - - -def load_optimizer(args, model, optimizer, scaler): - potential_checkpoint = args.resume.replace("epoch_", "optimizer_") - if 
check_exists(potential_checkpoint): - checkpoint = pt_load(potential_checkpoint, map_location="cpu") - else: - checkpoint = pt_load(args.resume, map_location="cpu") - if "optimizer" in checkpoint: - if optimizer is not None: - osd = checkpoint["optimizer"] - if args.fsdp: - osd = FSDP.optim_state_dict_to_load(model=model, optim=optimizer, optim_state_dict=osd) - optimizer.load_state_dict(osd) - logging.info(f"=> resuming optimizer") - if scaler is not None and "scaler" in checkpoint: - scaler.load_state_dict(checkpoint["scaler"]) - else: - logging.info(f"=> WARNING: not resuming optimizer.") - - -def load_data_chunks(args): - checkpoint = pt_load(args.resume, map_location="cpu") - if "next_shard_per_source" in checkpoint and "samples_seen" in checkpoint: - return checkpoint["next_shard_per_source"], checkpoint["samples_seen"] - else: - logging.info( - "=> WARNING: tried to resume a checkpoint without data loading info. Re-starting data loading from the " - "first shard." - ) - return [0 for _ in range(len(args.dataset_manifest))], 0 - - -def save_checkpoint( - args, - model, - optimizer, - scaler, - completed_epoch, - evaluation_metrics, - step, - is_final_checkpoint, - percentage_of_data_seen=-1.0, - next_shard_per_source=None, - samples_seen=None, - shard_shuffle_seed=None, - train_data_string=None, - averagers=None, - failed=False, -): - cpu_state, optim_state = None, None - if args.logs and args.logs.lower() != "none" and args.fsdp: - save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) - with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy): - cpu_state = model.state_dict() - optim_state = FSDP.optim_state_dict(model, optimizer) - if args.save_logs: - checkpoint_dict_model = { - "epoch": completed_epoch, - "name": args.name, - "state_dict": cpu_state if args.fsdp else model.state_dict(), - "evaluation_metrics": evaluation_metrics, - } - if next_shard_per_source is not None: - checkpoint_dict_model["next_shard_per_source"] = next_shard_per_source - - if samples_seen is not None: - checkpoint_dict_model["samples_seen"] = samples_seen - - if step is not None: - checkpoint_dict_model["step"] = step - - if shard_shuffle_seed is not None: - checkpoint_dict_model["shard_shuffle_seed"] = shard_shuffle_seed - - checkpoint_dict_opt = { - "epoch": completed_epoch, - "name": args.name, - "optimizer": optim_state if args.fsdp else optimizer.state_dict(), - "evaluation_metrics": evaluation_metrics, - } - - if scaler is not None: - checkpoint_dict_opt["scaler"] = scaler.state_dict() - - checkpoint_dict_stats = { - "epoch": completed_epoch, - "name": args.name, - "is_final_checkpoint": is_final_checkpoint, - "evaluation_metrics": evaluation_metrics, - "percentage_of_data_seen": percentage_of_data_seen, - } - if next_shard_per_source is not None: - checkpoint_dict_stats["next_shard_per_source"] = next_shard_per_source - - if samples_seen is not None: - checkpoint_dict_stats["samples_seen"] = samples_seen - - if step is not None: - checkpoint_dict_stats["step"] = step - - if shard_shuffle_seed is not None: - checkpoint_dict_stats["shard_shuffle_seed"] = shard_shuffle_seed - - if train_data_string is not None: - checkpoint_dict_stats["train_data_string"] = train_data_string - - prefixes = { - "epoch_": checkpoint_dict_model, - "optimizer_": checkpoint_dict_opt, - "stats_": checkpoint_dict_stats, - } - - if averagers is not None: - for k in averagers.avgs_dict: - prefixes[f"{k}_"] = averagers.avgs_dict[k].get_state_dict_avg() - if ( - completed_epoch == 
args.epochs - or is_final_checkpoint - or (args.save_frequency > 0 and (completed_epoch % args.save_frequency) == 0) - ): - for prefix in prefixes: - save_path = args.checkpoint_path if not failed else args.failed_checkpoint_path - path = os.path.join(save_path, f"{prefix}{completed_epoch}.pt") - print(f"Saving {prefix}{completed_epoch} in {path}...") - torch.save( - prefixes[prefix], - path, - ) - - if args.delete_previous_checkpoint: - for prefix in prefixes: - prev = os.path.join(args.checkpoint_path, f"{prefix}{completed_epoch - 1}.pt") - if os.path.exists(prev): - os.remove(prev) - - -def cleanup(sync_process, distributed=False): - if sync_process: - terminate_sync_process(sync_process) - if distributed and torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - -def main(args): - args = parse_args(args) - - requires_training = args.train_data or args.dataset_type == "synthetic" or args.dataset_manifest is not None - - if torch.cuda.is_available(): - # This enables tf32 on Ampere GPUs which is only 8% slower than - # float16 and almost as accurate as float32 - # This was a default in pytorch until 1.12 - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.benchmark = True - torch.backends.cudnn.deterministic = False - - # fully initialize distributed device environment - device = init_distributed_device(args) - - assert ( - args.global_batch_size % args.world_size == 0 - ), f"Global batch size ({args.global_batch_size}) is not divisible by number of GPUs ({args.world_size}), and thus cannot be respected." - - args.per_gpu_batch_size = max(args.global_batch_size // args.world_size, 1) - if args.val_data is not None: - args.per_gpu_val_batch_size = max(args.global_val_batch_size // args.world_size, 1) - - if args.hf_model is not None and args.hf_seq_len is None: - raise ValueError("If passing --hf-model, must also pass --hf-seq-len to be used for training/fine-tuning.") - - if args.hf_model is not None and args.fsdp and args.hf_fsdp_block is None: - raise ValueError("If passing --hf-model and --fsdp, must also pass --hf-fspd-block.") - - if args.fsdp and not args.distributed: - raise ValueError(f"--fsdp can only be specified in distributed mode.") - - # get the name of the experiments - if args.name is None: - # sanitize model name for filesystem / uri use, easier if we don't use / in name as a rule? - model_name_safe = None - if args.hf_model is not None: - model_name_safe = args.hf_model.replace("/", "-") - else: - if Path(args.model).is_file(): - model_name_safe = Path(args.model).stem.replace("/", "-") - else: - model_name_safe = args.model.replace("/", "-") - - date_str = datetime.now().strftime("%Y_%m_%d-%H_%M_%S") - if args.distributed: - # sync date_str from master to all ranks - date_str = broadcast_object(args, date_str) - args.name = "-".join( - [ - date_str, - f"model_{model_name_safe}", - f"lr_{args.lr}", - f"b_{args.per_gpu_batch_size}", # Per gpu to respect old naming convention - ] - ) - - resume_latest = args.resume == "latest" - log_base_path = os.path.join(args.logs, args.name) - args.log_path = None - if is_master(args, local=args.log_local): - os.makedirs(log_base_path, exist_ok=True) - log_filename = f"out-{args.rank}" if args.log_local else "out.log" - args.log_path = os.path.join(log_base_path, log_filename) - if os.path.exists(args.log_path) and not resume_latest: - raise ValueError(f"Experiment {args.log_path} already exists. 
Use --name to specify a new experiment.") - - # Setup text logger - args.log_level = logging.DEBUG if args.debug else logging.INFO - setup_logging(args.log_path, args.log_level) - - # Setup wandb, tensorboard, checkpoint logging - args.wandb = "wandb" in args.report_to or "all" in args.report_to - args.tensorboard = "tensorboard" in args.report_to or "all" in args.report_to - args.checkpoint_path = os.path.join(log_base_path, "checkpoints") - args.failed_checkpoint_path = os.path.join(log_base_path, "checkpoints_failed") - if is_master(args): - args.tensorboard_path = os.path.join(log_base_path, "tensorboard") if args.tensorboard else "" - for dirname in [args.tensorboard_path, args.checkpoint_path, args.failed_checkpoint_path]: - if dirname: - os.makedirs(dirname, exist_ok=True) - else: - args.tensorboard_path = "" - - if resume_latest: - resume_from = None - checkpoint_path = args.checkpoint_path - - # If using remote_sync, need to check the remote instead of the local checkpoints folder. - if args.remote_sync is not None: - checkpoint_path = os.path.join(args.remote_sync, args.name, "checkpoints") - - if is_master(args): - # Checking for existing checkpoint via master rank only. It is possible for - # different rank processes to see different files if a shared file-system is under - # stress, however it's very difficult to fully work around such situations. - if args.save_most_recent: - # if --save-most-recent flag is set, look for latest at a fixed filename - resume_from = os.path.join(checkpoint_path, "checkpoints", LATEST_CHECKPOINT_NAME) - if not os.path.exists(resume_from): - # If no latest checkpoint has been saved yet, don't try to resume - resume_from = None - else: - # otherwise, list checkpoint dir contents and pick the newest checkpoint - resume_from = get_latest_checkpoint(checkpoint_path) - if resume_from: - logging.info(f"Found latest resume checkpoint at {resume_from}.") - else: - logging.info(f"No latest resume checkpoint found in {checkpoint_path}.") - if args.distributed: - # sync found checkpoint path to all ranks - resume_from = broadcast_object(args, resume_from) - args.resume = resume_from - - if args.copy_codebase: - copy_codebase(args) - - # start the sync proces if remote-sync is not None - remote_sync_process = None - if is_master(args) and args.remote_sync is not None: - # first make sure it works - result = remote_sync_with_expon_backoff( - args.remote_sync_frequency, - os.path.join(args.logs, args.name), - os.path.join(args.remote_sync, args.name), - args.remote_sync_protocol, - ) - if result: - logging.info("remote sync successful.") - else: - raise ValueError("Remote sync failed.") - # if all looks good, start a process to do this every args.remote_sync_frequency seconds - remote_sync_process = start_sync_process( - args.remote_sync_frequency, - os.path.join(args.logs, args.name), - os.path.join(args.remote_sync, args.name), - args.remote_sync_protocol, - ) - remote_sync_process.start() - - # Handle cleanup even if open_lm crashes. - # TODO: For cases where main() is called as a functio, we need to call cleanup() manually. - # Right now, we do this manually in every case where main returns, but we should put main() in a wrapper and call - # cleanup() outside it, ideally. - atexit.register(cleanup, sync_process=remote_sync_process, distributed=args.distributed) - - if args.precision == "fp16": - logging.warning( - "It is recommended to use AMP mixed-precision instead of FP16. " - "FP16 support needs further verification and tuning, especially for train." 
- ) - - elif args.distributed: - logging.info( - f"Running in distributed mode with multiple processes. Device: {args.device}." - f"Process (global: {args.rank}, local {args.local_rank}), total {args.world_size}." - ) - else: - logging.info(f"Running with a single process. Device {args.device}.") - - random_seed(args.seed, 0) - - model = None - if args.hf_model is not None: - model = create_wrapped_hf_model(args) - else: - # Optional: Use meta device - with torch.device("meta" if args.experimental_meta_device and args.fsdp else args.device): - if args.classification: - model = create_classif_model(args) - else: - model = create_model(args) - - args.vocab_size = model.vocab_size - args.seq_len = model.seq_len - if args.train_num_samples is not None: - args.train_num_samples //= args.seq_len - if args.val_num_samples is not None: - if args.val_num_samples // args.seq_len == 0: - raise ValueError( - f"number of requested evaluation val_num_samples (tokens): {args.val_num_samples} is less than seq_len: {args.seq_len}" - ) - args.val_num_samples //= args.seq_len - - averagers = None - random_seed(args.seed, args.rank) - - if args.grad_checkpointing: - model.set_grad_checkpointing() - - if args.distributed: - if args.fsdp: - transformer_layer_cls = None - - if args.hf_model is not None: - # retrive the user specified block class for fsdp - for _, target_cls in model.named_modules(): - if args.hf_fsdp_block in type(target_cls).__name__: - transformer_layer_cls = {type(target_cls)} - break - - if transformer_layer_cls is None: - print(f"--hf-fsdp-block {args.hf_fsdp_block} not found in --hf-model {args.hf_model}") - return -1 - - else: - transformer_layer_cls = {Block} - # from https://pytorch.org/blog/efficient-large-scale-training-with-pytorch/ - transformer_auto_wrapper_policy = functools.partial( - transformer_auto_wrap_policy, - transformer_layer_cls=transformer_layer_cls, - ) - # tries to follow gopher... - mp_policy = None - if args.fsdp_amp: - print("=> using bfloat16 params as part of fsdp amp policy.") - mp_policy = MixedPrecision( - param_dtype=torch.bfloat16, - reduce_dtype=torch.float32, - buffer_dtype=torch.bfloat16, - ) - elif args.fsdp_pure_bf16: - print("=> using pure bfloat16 params as part of fsdp amp policy.") - mp_policy = MixedPrecision( - param_dtype=torch.bfloat16, - reduce_dtype=torch.bfloat16, - buffer_dtype=torch.bfloat16, - ) - - if args.rank == 0: - print(f"Before FSDP parameter num: {sum(p.numel() for p in model.parameters()):,}") - print(f"Before FSDP {torch.cuda.memory_allocated()/1024**3:.3} GB") - - fsdp_kwargs = {} - assert not ( - args.fsdp_hybrid and args.fsdp_hybrid_o2 - ), "Only --fsdp-hybrid or --fsdp-hybrid-o2 should be set." - if args.fsdp_backward_prefetch: - fsdp_kwargs["backward_prefetch"] = BackwardPrefetch.BACKWARD_PRE - if args.fsdp_hybrid: - fsdp_kwargs["sharding_strategy"] = ShardingStrategy.HYBRID_SHARD - if args.fsdp_hybrid_o2: - fsdp_kwargs["sharding_strategy"] = ShardingStrategy._HYBRID_SHARD_ZERO2 - print("=> FSDP kwargs: ", fsdp_kwargs) - - # Initialize FSDP. Use the same seed across workers to ensure reset_parameters is the same across workers. 
- random_seed(args.seed, rank=0) - model = FSDP( - model, - auto_wrap_policy=transformer_auto_wrapper_policy, - device_id=device, - mixed_precision=mp_policy, - cpu_offload=CPUOffload(offload_params=args.fsdp_cpu_offload), - use_orig_params=args.fsdp_use_orig_params, - limit_all_gathers=args.fsdp_limit_all_gathers, - **fsdp_kwargs, - ) - - print(f"After FSDP parameter num: {sum(p.numel() for p in model.parameters()):,} on rank {args.rank}") - print(f"After FSDP {torch.cuda.memory_allocated()/1024**3:.3} GB on rank {args.rank}") - else: - ddp_args = {} - if args.ddp_static_graph: - # this doesn't exist in older PyTorch, arg only added if enabled - ddp_args["static_graph"] = True - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], **ddp_args) - if args.averagers is not None: - averagers = ModelAverager(model, args.averagers) - if args.resume is not None and averagers is not None: - load_avg_models(args, averagers) - - if is_master(args): - logging.info(f"Model (has {sum(p.numel() for p in model.parameters() if p.requires_grad)} parameters):") - logging.info(f"{str(model)}") - logging.info("Params:") - params_file = os.path.join(args.logs, args.name, "params.txt") - with open(params_file, "w") as f: - for name in sorted(vars(args)): - val = getattr(args, name) - logging.info(f" {name}: {val}") - f.write(f"{name}: {val}\n") - - # optionally resume model from a checkpoint - start_epoch, global_step = 0, 0 - shard_shuffle_seed = args.seed - if args.resume is not None: - start_epoch, global_step, shard_shuffle_seed = load_model(args, model) - - elif args.pretrained is not None: - print("=> loading from a pre-trained model.") - args.resume = args.pretrained - # this flag continues training from the pre-trained model. - if args.load_pretrained_state: - start_epoch, global_step, shard_shuffle_seed = load_model(args, model) - else: - load_model(args, model, different_seed=True) - args.resume = None - elif args.average is not None: - num_models_to_average = len(args.average) - print( - "=> Averaging models: ", - args.average, - " with coefficients: ", - args.average_coefficients, - ) - assert num_models_to_average > 1, "num_models_to_average must be > 1 - else use --pretrained" - if args.average_coefficients is None: - args.average_coefficients = [1.0 / num_models_to_average] * num_models_to_average - else: - assert len(args.average_coefficients) == num_models_to_average - state_dict = {k: v * args.average_coefficients[0] for k, v in get_state_dict(args.average[0]).items()} - for i in range(1, num_models_to_average): - state_dict_i = get_state_dict(args.average[i]) - for k in state_dict: - state_dict[k] = state_dict[k] + state_dict_i[k] * args.average_coefficients[i] - model.load_state_dict(state_dict) - - # Put the shard shuffle seed back into args (this is done for compatibility with older, non shuffling versions) - args.shard_shuffle_seed = shard_shuffle_seed - - if requires_training and global_step is None: - raise ValueError("Key 'step' not found in checkpoint, but required for training.") - - # Add data chunk when resuming (only for dataset without resampling) - next_shard_per_source = [0 for _ in range(len(args.dataset_manifest))] if args.dataset_manifest is not None else 0 - samples_seen = 0 - if args.resume is not None and args.dataset_manifest is not None: - next_shard_per_source, samples_seen = load_data_chunks(args) - if samples_seen >= args.train_num_samples * args.epochs: - raise RuntimeError("Loaded a checkpoint which has already seen the desired number of 
tokens.") - - # create optimizer and scaler - optimizer = None - scaler = None - - if requires_training: - named_parameters = list(model.named_parameters()) - no_decay_params = [] # to be potentially used later - params = [p for n, p in named_parameters if p.requires_grad] - - optimizer = optim.AdamW( - [ - {"params": no_decay_params, "weight_decay": 0.0}, - {"params": params, "weight_decay": args.wd}, - ], - lr=args.lr, - betas=(args.beta1, args.beta2), - eps=args.eps, - ) - scaler = None - if args.precision == "amp": - assert not args.fsdp, "FSDP not supported with amp, only amp_bfloat16" - scaler = GradScaler() - - # initialize datasets - # use tokenizer=None because the data is already pre-tokenized. - - data = get_data( - args, - epoch=start_epoch, - tokenizer=None, - skip_train=args.dataset_manifest is not None, - floor=args.dataset_manifest is not None, - ) - - if args.target_mask_left is not None: - # tokens handled with same modulo in dataloading - args.target_mask_left = proc_token(args.target_mask_left, args.vocab_size) - - if args.target_mask_individual is not None: - # tokens handled with same modulo in dataloading - args.target_mask_individual = proc_token(args.target_mask_individual, args.vocab_size) - - if args.torchcompile: - logging.info("Compiling model...") - model = torch.compile(model) - if averagers is not None: - logging.info("Compiling averagers...") - for k in averagers.avgs_dict: - averagers.avgs_dict[k].av_model = torch.compile(averagers.avgs_dict[k].av_model) - - # optionally resume optimizer from a checkpoint - # this needs to be after torchcompile - if args.resume is not None: - load_optimizer(args, model, optimizer, scaler) - - # create scheduler if train - scheduler = None - if requires_training: - if args.dataset_manifest is not None: - total_steps = (args.train_num_samples * args.epochs) // args.global_batch_size - else: - total_steps = (data["train"].dataloader.num_batches) * args.epochs - - if args.lr_scheduler == "cosine": - scheduler = cosine_lr( - optimizer, - args.lr, - args.warmup, - total_steps, - args.lr_cooldown_end, - args.force_min_lr, - ) - elif args.lr_scheduler == "const": - scheduler = const_lr( - optimizer, - args.lr, - args.warmup, - # total_steps, - # args.lr_cooldown_end, - # args.force_min_lr, - ) - else: - raise ValueError(f"Unknown scheduler, {args.lr_scheduler}. Available options are: cosine, const.") - - # determine if this worker should save logs and checkpoints. only do so if it is rank == 0 - args.save_logs = args.logs and args.logs.lower() != "none" and is_master(args) - writer = None - if args.save_logs and args.tensorboard: - assert tensorboard is not None, "Please install tensorboard." - writer = tensorboard.SummaryWriter(args.tensorboard_path) - if args.wandb and is_master(args): - assert wandb is not None, "Please install wandb." 
- logging.debug("Starting wandb.") - - wandb.init( - project=args.wandb_project_name, - name=args.name, - notes=args.wandb_notes, - tags=[], - resume=None, - config=vars(args), - ) - if args.debug: - wandb.watch(model, log="all") - wandb.save(params_file) - logging.debug("Finished loading wandb.") - - if not requires_training: - if not args.resume: - logging.info("No training required, exiting.") - cleanup(remote_sync_process, args.distributed) - return - logging.info("No training required, evaluating instead.") - checkpoint_root = os.path.dirname(args.resume) - - if averagers is not None: - k = next(iter(averagers.avgs_dict.keys())) - logging.info(f"=> evaluation avg {k}") - model = averagers.avgs_dict[k].av_model - metrics = evaluate_loop(model, data["val_list"], start_epoch, args, writer) - metrics["average"] = k if averagers is not None else "none" - - if is_master(args): - with fsspec.open(os.path.join(checkpoint_root, "results.jsonl"), "a") as f: - f.write(f"{json.dumps(metrics)}\n") - - cleanup(remote_sync_process, args.distributed) - return - - loss = torch.nn.CrossEntropyLoss() - if args.z_loss_coefficient != 0.0: - if is_master(args): - logging.info("Using CrossEntropyLossWithZLoss.") - loss = CrossEntropyLossWithZLoss(args.z_loss_coefficient) - - if args.dataset_manifest: - log_num_checkpoints(total_steps, args) - - # Only enter training loop if there are steps to be done. - done_training = global_step >= total_steps - epoch = start_epoch - num_ckpt_too_few_tokens = 0 - while not done_training: - if is_master(args): - logging.info(f"Start epoch {epoch}") - - if args.dataset_manifest is not None: - assert not args.dataset_resampled, "dataset_manifest and dataset_resampled are mutually exclusive" - ( - train_data_string_per_source, - num_samples_per_source, - next_shard_per_source, - ) = get_string_for_epoch( - args.train_num_samples, - next_shard_per_source, - args.dataset_manifest, - args.train_data_mix_weights, - args.workers, - args.world_size, - multi_epoch=args.multiple_data_passes, - shard_shuffle_seed=args.shard_shuffle_seed, - ) - - # In the distributed case, make sure that all nodes receive the same string - if args.distributed: - all_source_strings = ["" for _ in range(args.world_size)] - dist.all_gather_object(all_source_strings, train_data_string_per_source) - assert all( - [x == train_data_string_per_source for x in all_source_strings] - ), "Dataset to train on is not the same across all nodes. This should not happen normally, unless there is an issue with shard shuffling during the dataset generation." - - if data["train"] is not None: - del data["train"] - args.train_data = train_data_string_per_source - - # Draw num_samples_per_source at most from dataset - rounded down to guarantee uniqueness. 
- data["train"] = get_wds_dataset( - args, True, epoch, force_num_samples=num_samples_per_source, data_key=args.data_key, floor=True - ) - - prev_step = global_step - if is_master(args): - logging.info(f"=> epoch {epoch}, training on {args.train_data}") - - if args.distributed: - dist.barrier() - - - #for batch in data["train"].dataloader: - # (texts, labels) = batch - # print(labels) - - # Get the dataloader and create an iterator - #dataloader = data["train"].dataloader - #data_iterator = iter(dataloader) - #batch = next(data_iterator) - - #(texts, labels) = batch - - #texts = torch.LongTensor(texts).to('cuda:0') - #labels = torch.LongTensor(labels).to('cuda:0') - #print(labels, labels.size()) - #labels = labels.unsqueeze(1).repeat(1, args.seq_len) - #print(labels, labels.size()) - - - #print(len(texts), texts.dtype, texts[0].size()) - #print(len(labels), labels.dtype, labels.size()) - #print(labels) - - - #print(len(labels), labels.dtype) - - #print("len(texts)= ", len(texts), " size(texts[0])= ", texts[0].size()) - #print(type(texts), type(texts[0])) - - #inputs, targets = sample_chunk(texts, args) - - #print("len(inputs)= ", len(inputs), " size(inputs[0])= ", inputs[0].size()) - #print(type(inputs), type(inputs[0])) - - #print("len(targets)= ", len(targets), " size(targets)= ", targets[0].size()) - #print(type(targets), type(targets[0])) - - #print("texts[0]= ", texts[0]) - #print("inputs[0]= ", inputs[0]) - #print("targets[0]= ", targets[0]) - - #out, _, _ = model(inputs) - - #print("len(out)= ", len(out), " size(out)= ", out.size()) - #print(type(out), type(out[0])) - #print("out[0]= ", out[0]) - - #device = next(model.parameters()).device - #print(inputs.device, device) - - #print("reshape") - #print("out reshaped: ", out.reshape(-1, args.vocab_size).size(), "targets reshaped: ", targets.reshape(-1).size()) - #print(targets.dtype) - #print(targets) - - #out = out[:, -1, :] - #print("out reshaped: ", out.reshape(-1, args.num_classes).size(), "lables reshaped: ", labels.reshape(-1).size()) - - success, global_step = train_one_epoch( - model, - data, - loss, - averagers=averagers, - epoch=epoch, - step=global_step, - optimizer=optimizer, - scaler=scaler, - scheduler=scheduler, - total_steps=total_steps, - args=args, - tb_writer=writer, - ) - - if args.distributed: - dist.barrier() - - done_training = global_step >= total_steps - steps_done_epoch = global_step - prev_step - samples_seen = samples_seen + steps_done_epoch * args.global_batch_size - - if not success: - logging.info("Training exiting due to NaN value") - break - - failed_ckpt = False - expected_steps = data["train"].dataloader.num_batches - if steps_done_epoch < (1 - args.data_tolerate_error_p) * expected_steps and not done_training: - failed_ckpt = True - num_ckpt_too_few_tokens += 1 - if is_master(args): - logging.warning( - f"Epoch {epoch}, tokens seen: {steps_done_epoch * args.global_batch_size * args.seq_len}, tokens expected: {expected_steps * args.global_batch_size * args.seq_len}, ratio: {steps_done_epoch / expected_steps}" - ) - - epoch = epoch + 1 - evaluation_metrics = [] - if "val_list" in data and (epoch % args.val_frequency == 0 or done_training): - # validate based on frequency and always validate the last checkpoint - try: - evaluation_metrics = evaluate_loop(model, data["val_list"], epoch, args, writer) - - if is_master(args): - with fsspec.open(os.path.join(args.checkpoint_path, "results.jsonl"), "a") as f: - f.write(f"{json.dumps(evaluation_metrics)}\n") - - except Exception as e: - if is_master(args): - 
-                    logging.error(e)
-                    logging.error(traceback.format_exc())
-                    logging.warning("evaluation failed! continuing to save_checkpoint")
-
-        if is_master(args):
-            end_of_epoch_log = {
-                "epoch": epoch,
-                "tokens": (global_step + 1) * args.global_batch_size * args.seq_len,
-                "checkpoints_too_few_tokens": num_ckpt_too_few_tokens,
-                "percentage_of_data_seen": steps_done_epoch / expected_steps,
-            }
-
-            if args.dataset_manifest is not None:
-                for i in range(len(next_shard_per_source)):
-                    end_of_epoch_log[f"next_shard_{i}"] = next_shard_per_source[i]
-                    end_of_epoch_log[f"dataset_pass_{i}"] = next_shard_per_source[i] // len(
-                        get_metadata_file(args.dataset_manifest[i])
-                    )
-
-            for name, val in end_of_epoch_log.items():
-                name = "train/" + name
-                if writer is not None:
-                    writer.add_scalar(name, val, global_step)
-                if args.wandb:
-                    assert wandb is not None, "Please install wandb."
-                    wandb.log({name: val, "step": global_step, "tokens": end_of_epoch_log["tokens"]})
-
-        # Saving checkpoints.
-        save_checkpoint(
-            args,
-            model,
-            optimizer,
-            scaler,
-            epoch,
-            evaluation_metrics,
-            step=global_step,
-            is_final_checkpoint=done_training,
-            percentage_of_data_seen=1.0 * steps_done_epoch / expected_steps,
-            next_shard_per_source=next_shard_per_source if args.dataset_manifest is not None else None,
-            samples_seen=samples_seen if args.dataset_manifest is not None else None,
-            shard_shuffle_seed=args.shard_shuffle_seed,
-            train_data_string=train_data_string_per_source if args.dataset_manifest is not None else None,
-            averagers=averagers,
-            failed=failed_ckpt,
-        )
-
-        if num_ckpt_too_few_tokens > args.data_tolerate_num_ckpts:
-            raise RuntimeError(
-                f"{num_ckpt_too_few_tokens} checkpoints happened where the number of tokens seen was {1 - args.data_tolerate_error_p} of expected. This is likely due to transient errors e.g. reading from S3."
-            )
-
-        if done_training:
-            if is_master(args):
-                logging.info("Model has seen the desired number of tokens. Ending training.")
-            break
-
-    if args.wandb and is_master(args):
-        wandb.finish()
-
-    # run a final sync.
-    if remote_sync_process is not None:
-        logging.info("Final remote sync.")
-        terminate_sync_process(remote_sync_process)
-        result = remote_sync_with_expon_backoff(
-            args.remote_sync_frequency,
-            os.path.join(args.logs, args.name),
-            os.path.join(args.remote_sync, args.name),
-            args.remote_sync_protocol,
-        )
-        if result:
-            logging.info("Final remote sync successful.")
-        else:
-            logging.info("Final remote sync failed.")
-
-    # Final sync of all procs.
-    if args.distributed:
-        dist.barrier()
-
-    cleanup(remote_sync_process, args.distributed)
-    return args
-
-
-def copy_codebase(args):
-    from shutil import copytree, ignore_patterns
-
-    new_code_path = os.path.join(args.logs, args.name, "code")
-    if os.path.exists(new_code_path):
-        print(f"Error. Experiment already exists at {new_code_path}. Use --name to specify a new experiment.")
-        return -1
-    print(f"Copying codebase to {new_code_path}")
-    current_code_path = os.path.realpath(__file__)
-    for _ in range(3):
-        current_code_path = os.path.dirname(current_code_path)
-    copytree(current_code_path, new_code_path, ignore=ignore_patterns("log", "logs", "wandb"))
-    print("Done copying code.")
-    return 1
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
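For readers tracing the deleted loop above: the step and token bookkeeping reduces to a few integer identities. A minimal Python sketch with hypothetical numbers (none of these values come from this patch), following the same formulas used for total_steps, samples_seen, and the logged token count:

# Hypothetical settings, for illustration only.
seq_len = 2048                      # model.seq_len
global_batch_size = 256             # args.global_batch_size
epochs = 2                          # args.epochs
train_num_tokens = 1_000_000_000    # --train-num-samples is given in tokens

# main2.py first converts the token budget into sequences ...
train_num_samples = train_num_tokens // seq_len
# ... then derives the step budget shared by the scheduler and the while-loop.
total_steps = (train_num_samples * epochs) // global_batch_size

# Per-epoch progress accounting, as in the loop above.
steps_done_epoch = 1200  # what train_one_epoch would report
samples_seen = steps_done_epoch * global_batch_size
tokens_seen = steps_done_epoch * global_batch_size * seq_len
print(total_steps, samples_seen, tokens_seen)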
diff --git a/open_lm/manifest.jsonl b/open_lm/manifest.jsonl
deleted file mode 100644
index 1e4cc33..0000000
--- a/open_lm/manifest.jsonl
+++ /dev/null
@@ -1,200 +0,0 @@
-{"shard": "shard-0000000", "num_sequences": 8192}
-{"shard": "shard-0000001", "num_sequences": 8192}
-{"shard": "shard-0000002", "num_sequences": 8192}
-{"shard": "shard-0000003", "num_sequences": 8192}
-{"shard": "shard-0000004", "num_sequences": 8192}
-{"shard": "shard-0000005", "num_sequences": 8192}
-{"shard": "shard-0000006", "num_sequences": 8192}
-{"shard": "shard-0000007", "num_sequences": 8192}
-{"shard": "shard-0000008", "num_sequences": 8192}
-{"shard": "shard-0000009", "num_sequences": 8192}
-{"shard": "shard-0000010", "num_sequences": 8192}
-{"shard": "shard-0000011", "num_sequences": 8192}
-{"shard": "shard-0000012", "num_sequences": 8192}
-{"shard": "shard-0000013", "num_sequences": 8192}
-{"shard": "shard-0000014", "num_sequences": 8192}
-{"shard": "shard-0000015", "num_sequences": 8192}
-{"shard": "shard-0000016", "num_sequences": 8192}
-{"shard": "shard-0000017", "num_sequences": 8192}
-{"shard": "shard-0000018", "num_sequences": 8192}
-{"shard": "shard-0000019", "num_sequences": 8192}
-{"shard": "shard-0000020", "num_sequences": 8192}
-{"shard": "shard-0000021", "num_sequences": 8192}
-{"shard": "shard-0000022", "num_sequences": 8192}
-{"shard": "shard-0000023", "num_sequences": 8192}
-{"shard": "shard-0000024", "num_sequences": 8192}
-{"shard": "shard-0000025", "num_sequences": 8192}
-{"shard": "shard-0000026", "num_sequences": 8192}
-{"shard": "shard-0000027", "num_sequences": 8192}
-{"shard": "shard-0000028", "num_sequences": 8192}
-{"shard": "shard-0000029", "num_sequences": 8192}
-{"shard": "shard-0000030", "num_sequences": 8192}
-{"shard": "shard-0000031", "num_sequences": 8192}
-{"shard": "shard-0000032", "num_sequences": 8192}
-{"shard": "shard-0000033", "num_sequences": 8192}
-{"shard": "shard-0000034", "num_sequences": 8192}
-{"shard": "shard-0000035", "num_sequences": 8192}
-{"shard": "shard-0000036", "num_sequences": 8192}
-{"shard": "shard-0000037", "num_sequences": 8192}
-{"shard": "shard-0000038", "num_sequences": 8192}
-{"shard": "shard-0000039", "num_sequences": 8192}
-{"shard": "shard-0000040", "num_sequences": 8192}
-{"shard": "shard-0000041", "num_sequences": 8192}
-{"shard": "shard-0000042", "num_sequences": 8192}
-{"shard": "shard-0000043", "num_sequences": 8192}
-{"shard": "shard-0000044", "num_sequences": 8192}
-{"shard": "shard-0000045", "num_sequences": 8192}
-{"shard": "shard-0000046", "num_sequences": 8192}
-{"shard": "shard-0000047", "num_sequences": 8192}
-{"shard": "shard-0000048", "num_sequences": 8192}
-{"shard": "shard-0000049", "num_sequences": 8192}
-{"shard": "shard-0000050", "num_sequences": 8192}
-{"shard": "shard-0000051", "num_sequences": 8192}
-{"shard": "shard-0000052", "num_sequences": 8192}
-{"shard": "shard-0000053", "num_sequences": 8192}
-{"shard": "shard-0000054", "num_sequences": 8192}
-{"shard": "shard-0000055", "num_sequences": 8192}
-{"shard": "shard-0000056", "num_sequences": 8192}
-{"shard": "shard-0000057", "num_sequences": 8192}
"shard-0000058", "num_sequences": 8192} -{"shard": "shard-0000059", "num_sequences": 8192} -{"shard": "shard-0000060", "num_sequences": 8192} -{"shard": "shard-0000061", "num_sequences": 8192} -{"shard": "shard-0000062", "num_sequences": 8192} -{"shard": "shard-0000063", "num_sequences": 8192} -{"shard": "shard-0000064", "num_sequences": 8192} -{"shard": "shard-0000065", "num_sequences": 8192} -{"shard": "shard-0000066", "num_sequences": 8192} -{"shard": "shard-0000067", "num_sequences": 8192} -{"shard": "shard-0000068", "num_sequences": 8192} -{"shard": "shard-0000069", "num_sequences": 8192} -{"shard": "shard-0000070", "num_sequences": 8192} -{"shard": "shard-0000071", "num_sequences": 8192} -{"shard": "shard-0000072", "num_sequences": 8192} -{"shard": "shard-0000073", "num_sequences": 8192} -{"shard": "shard-0000074", "num_sequences": 8192} -{"shard": "shard-0000075", "num_sequences": 8192} -{"shard": "shard-0000076", "num_sequences": 8192} -{"shard": "shard-0000077", "num_sequences": 8192} -{"shard": "shard-0000078", "num_sequences": 8192} -{"shard": "shard-0000079", "num_sequences": 8192} -{"shard": "shard-0000080", "num_sequences": 8192} -{"shard": "shard-0000081", "num_sequences": 8192} -{"shard": "shard-0000082", "num_sequences": 8192} -{"shard": "shard-0000083", "num_sequences": 8192} -{"shard": "shard-0000084", "num_sequences": 8192} -{"shard": "shard-0000085", "num_sequences": 8192} -{"shard": "shard-0000086", "num_sequences": 8192} -{"shard": "shard-0000087", "num_sequences": 8192} -{"shard": "shard-0000088", "num_sequences": 8192} -{"shard": "shard-0000089", "num_sequences": 8192} -{"shard": "shard-0000090", "num_sequences": 8192} -{"shard": "shard-0000091", "num_sequences": 8192} -{"shard": "shard-0000092", "num_sequences": 8192} -{"shard": "shard-0000093", "num_sequences": 8192} -{"shard": "shard-0000094", "num_sequences": 8192} -{"shard": "shard-0000095", "num_sequences": 8192} -{"shard": "shard-0000096", "num_sequences": 8192} -{"shard": "shard-0000097", "num_sequences": 8192} -{"shard": "shard-0000098", "num_sequences": 8192} -{"shard": "shard-0000099", "num_sequences": 8192} -{"shard": "shard-0000100", "num_sequences": 8192} -{"shard": "shard-0000101", "num_sequences": 8192} -{"shard": "shard-0000102", "num_sequences": 8192} -{"shard": "shard-0000103", "num_sequences": 8192} -{"shard": "shard-0000104", "num_sequences": 8192} -{"shard": "shard-0000105", "num_sequences": 8192} -{"shard": "shard-0000106", "num_sequences": 8192} -{"shard": "shard-0000107", "num_sequences": 8192} -{"shard": "shard-0000108", "num_sequences": 8192} -{"shard": "shard-0000109", "num_sequences": 8192} -{"shard": "shard-0000110", "num_sequences": 8192} -{"shard": "shard-0000111", "num_sequences": 8192} -{"shard": "shard-0000112", "num_sequences": 8192} -{"shard": "shard-0000113", "num_sequences": 8192} -{"shard": "shard-0000114", "num_sequences": 8192} -{"shard": "shard-0000115", "num_sequences": 8192} -{"shard": "shard-0000116", "num_sequences": 8192} -{"shard": "shard-0000117", "num_sequences": 8192} -{"shard": "shard-0000118", "num_sequences": 8192} -{"shard": "shard-0000119", "num_sequences": 8192} -{"shard": "shard-0000120", "num_sequences": 8192} -{"shard": "shard-0000121", "num_sequences": 8192} -{"shard": "shard-0000122", "num_sequences": 8192} -{"shard": "shard-0000123", "num_sequences": 8192} -{"shard": "shard-0000124", "num_sequences": 8192} -{"shard": "shard-0000125", "num_sequences": 8192} -{"shard": "shard-0000126", "num_sequences": 8192} -{"shard": "shard-0000127", "num_sequences": 
8192} -{"shard": "shard-0000128", "num_sequences": 8192} -{"shard": "shard-0000129", "num_sequences": 8192} -{"shard": "shard-0000130", "num_sequences": 8192} -{"shard": "shard-0000131", "num_sequences": 8192} -{"shard": "shard-0000132", "num_sequences": 8192} -{"shard": "shard-0000133", "num_sequences": 8192} -{"shard": "shard-0000134", "num_sequences": 8192} -{"shard": "shard-0000135", "num_sequences": 8192} -{"shard": "shard-0000136", "num_sequences": 8192} -{"shard": "shard-0000137", "num_sequences": 8192} -{"shard": "shard-0000138", "num_sequences": 8192} -{"shard": "shard-0000139", "num_sequences": 8192} -{"shard": "shard-0000140", "num_sequences": 8192} -{"shard": "shard-0000141", "num_sequences": 8192} -{"shard": "shard-0000142", "num_sequences": 8192} -{"shard": "shard-0000143", "num_sequences": 8192} -{"shard": "shard-0000144", "num_sequences": 8192} -{"shard": "shard-0000145", "num_sequences": 8192} -{"shard": "shard-0000146", "num_sequences": 8192} -{"shard": "shard-0000147", "num_sequences": 8192} -{"shard": "shard-0000148", "num_sequences": 8192} -{"shard": "shard-0000149", "num_sequences": 8192} -{"shard": "shard-0000150", "num_sequences": 8192} -{"shard": "shard-0000151", "num_sequences": 8192} -{"shard": "shard-0000152", "num_sequences": 8192} -{"shard": "shard-0000153", "num_sequences": 8192} -{"shard": "shard-0000154", "num_sequences": 8192} -{"shard": "shard-0000155", "num_sequences": 8192} -{"shard": "shard-0000156", "num_sequences": 8192} -{"shard": "shard-0000157", "num_sequences": 8192} -{"shard": "shard-0000158", "num_sequences": 8192} -{"shard": "shard-0000159", "num_sequences": 8192} -{"shard": "shard-0000160", "num_sequences": 8192} -{"shard": "shard-0000161", "num_sequences": 8192} -{"shard": "shard-0000162", "num_sequences": 8192} -{"shard": "shard-0000163", "num_sequences": 8192} -{"shard": "shard-0000164", "num_sequences": 8192} -{"shard": "shard-0000165", "num_sequences": 8192} -{"shard": "shard-0000166", "num_sequences": 8192} -{"shard": "shard-0000167", "num_sequences": 8192} -{"shard": "shard-0000168", "num_sequences": 8192} -{"shard": "shard-0000169", "num_sequences": 8192} -{"shard": "shard-0000170", "num_sequences": 8192} -{"shard": "shard-0000171", "num_sequences": 8192} -{"shard": "shard-0000172", "num_sequences": 8192} -{"shard": "shard-0000173", "num_sequences": 8192} -{"shard": "shard-0000174", "num_sequences": 8192} -{"shard": "shard-0000175", "num_sequences": 8192} -{"shard": "shard-0000176", "num_sequences": 8192} -{"shard": "shard-0000177", "num_sequences": 8192} -{"shard": "shard-0000178", "num_sequences": 8192} -{"shard": "shard-0000179", "num_sequences": 8192} -{"shard": "shard-0000180", "num_sequences": 8192} -{"shard": "shard-0000181", "num_sequences": 8192} -{"shard": "shard-0000182", "num_sequences": 8192} -{"shard": "shard-0000183", "num_sequences": 8192} -{"shard": "shard-0000184", "num_sequences": 8192} -{"shard": "shard-0000185", "num_sequences": 8192} -{"shard": "shard-0000186", "num_sequences": 8192} -{"shard": "shard-0000187", "num_sequences": 8192} -{"shard": "shard-0000188", "num_sequences": 8192} -{"shard": "shard-0000189", "num_sequences": 8192} -{"shard": "shard-0000190", "num_sequences": 8192} -{"shard": "shard-0000191", "num_sequences": 8192} -{"shard": "shard-0000192", "num_sequences": 8192} -{"shard": "shard-0000193", "num_sequences": 8192} -{"shard": "shard-0000194", "num_sequences": 8192} -{"shard": "shard-0000195", "num_sequences": 8192} -{"shard": "shard-0000196", "num_sequences": 8192} -{"shard": "shard-0000197", 
"num_sequences": 8192} -{"shard": "shard-0000198", "num_sequences": 8192} -{"shard": "shard-0000199", "num_sequences": 8192} \ No newline at end of file diff --git a/open_lm/model.py b/open_lm/model.py index 3c00cc4..ba2dd1b 100644 --- a/open_lm/model.py +++ b/open_lm/model.py @@ -509,22 +509,23 @@ def create_model(args): def create_classif_model(args): model = Transformer(create_params(args)) - checkpoint = pt_load(args.classif_model_path, map_location="cpu") - model.load_state_dict(checkpoint["state_dict"]) + if args.classif_model_path is not None: + checkpoint = pt_load(args.classif_model_path, map_location="cpu") + model.load_state_dict(checkpoint["state_dict"]) dim = model.output.in_features model.output = nn.Linear(dim, args.num_classes, bias = False) - + return model -def test_classif_model(args, model_path): +def test_classif_model(args): model = Transformer(create_params(args)) dim = model.output.in_features model.output = nn.Linear(dim, args.num_classes, bias = False) - checkpoint = pt_load(model_path, map_location="cpu") + checkpoint = pt_load(args.classif_model_path, map_location="cpu") model.load_state_dict(checkpoint["state_dict"]) return model diff --git a/open_lm/params.py b/open_lm/params.py index f74fa89..1b5f79f 100644 --- a/open_lm/params.py +++ b/open_lm/params.py @@ -804,7 +804,8 @@ def parse_args(args): type=str, default=None, help="Path of the pretrained model to be finetuned for classification.", - ) + ) + add_model_args(parser) config = maybe_load_config(parser, args) diff --git a/open_lm/positional_embedding/__pycache__/__init__.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/__init__.cpython-310.pyc index 3a8f889c0fd95673df95bb5d4b6d5e8f6ef26714..926d8c8d8f24f9375bbc2b2ac6aad53f3868f05a 100644 GIT binary patch delta 19 ZcmZ3$xPXy6pO=@50SNZ~J~NSfDgZ751(pB+ delta 19 ZcmZ3$xPXy6pO=@50SG3QHcaH63IHiG1epK; diff --git a/open_lm/positional_embedding/__pycache__/head_rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/head_rotary.cpython-310.pyc index 9904228a597d336ae6ffd9285468f8c4e42e62a6..9a508c1979ce1ae5897e26c0944f1e70e1e68b01 100644 GIT binary patch delta 20 acmaFJ_mGb}pO=@50SNZ~KC_WKj2!?!8wMHx delta 20 acmaFJ_mGb}pO=@50SG3SHf-b$V+Q~^RRtaZ diff --git a/open_lm/positional_embedding/__pycache__/llama_rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/llama_rotary.cpython-310.pyc index 47159ae62c9e0a09fc02a8203f66a58a7ddf8142..8d8be0b0ed132a7bae7ea1ec7e860aa1bc364ead 100644 GIT binary patch delta 20 acmZ3ayGWNipO=@50SNZ~KC_W~x)=aDt_CUq delta 20 acmZ3ayGWNipO=@50SG3SHf-dcE(QQJ=mjnS diff --git a/open_lm/positional_embedding/__pycache__/none.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/none.cpython-310.pyc index 296bc4ea4ac192eb82999a89aaa36cd065ea7174..5dbdb82aee29f4e6ec1c7a8ea95405ab8f67b130 100644 GIT binary patch delta 20 acmcb_bcu;OpO=@50SNZ~KC_YAoe=;#4F%5t delta 20 acmcb_bcu;OpO=@50SG3QHf-c}X9NH>Km^hN diff --git a/open_lm/positional_embedding/__pycache__/rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/rotary.cpython-310.pyc index d4bd893eeca9abaed40259b35b90aa022ad498de..6c775c685b78ccd63a968a1748e7ddc8851524c5 100644 GIT binary patch delta 184 zcmew>^H+vDpO=@50SNZ~KC_X#nTs)Mb3d0DBbyOWsK|KoYVK`}rjwI+ep#4;1Pnlg z4v^4fEm8)tAp?NW=m}Sb_*^5CJyV scJe)52?;wOKLUu0S%3rwgA8L4NTkSYG9RBTqxEDRJ}E}K$?kk+02EFln*aa+ delta 184 zcmew>^H+vDpO=@50SG3SHf-c>=3>0Hxt~jnkwK%orgdkQ@SnIe74t1uxPFV<*!|24}iE)jcd; zy?F4Vvp$PW#A6aa z+MY;wr4>8jSB9(+LhYsZf!wfLo#%^m?hwxEiVHJlpgBeXR5A!J~`&3@V+V15Mw6{!d77~gU5nZOdp?Tqbs 
diff --git a/open_lm/positional_embedding/__pycache__/__init__.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/__init__.cpython-310.pyc
index 3a8f889c0fd95673df95bb5d4b6d5e8f6ef26714..926d8c8d8f24f9375bbc2b2ac6aad53f3868f05a 100644
GIT binary patch
diff --git a/open_lm/positional_embedding/__pycache__/head_rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/head_rotary.cpython-310.pyc
index 9904228a597d336ae6ffd9285468f8c4e42e62a6..9a508c1979ce1ae5897e26c0944f1e70e1e68b01 100644
GIT binary patch
diff --git a/open_lm/positional_embedding/__pycache__/llama_rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/llama_rotary.cpython-310.pyc
index 47159ae62c9e0a09fc02a8203f66a58a7ddf8142..8d8be0b0ed132a7bae7ea1ec7e860aa1bc364ead 100644
GIT binary patch
diff --git a/open_lm/positional_embedding/__pycache__/none.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/none.cpython-310.pyc
index 296bc4ea4ac192eb82999a89aaa36cd065ea7174..5dbdb82aee29f4e6ec1c7a8ea95405ab8f67b130 100644
GIT binary patch
diff --git a/open_lm/positional_embedding/__pycache__/rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/rotary.cpython-310.pyc
index d4bd893eeca9abaed40259b35b90aa022ad498de..6c775c685b78ccd63a968a1748e7ddc8851524c5 100644
GIT binary patch
diff --git a/open_lm/utils/transformers/__pycache__/hf_config.cpython-310.pyc b/open_lm/utils/transformers/__pycache__/hf_config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6a0f302ae27e78753248a6980c896ea333914d1
GIT binary patch
diff --git a/requirements.txt b/requirements.txt
>=0.0.22
 tiktoken
 wandb
diff --git a/requirements_test.txt b/requirements_test.txt
index 61f15ce..8413123 100644
--- a/requirements_test.txt
+++ b/requirements_test.txt
@@ -3,4 +3,4 @@ pytest-cov==3.0.0
 pytest-xdist==2.5.0
 pytest==7.0.1
 tensorboard==2.14.1
-llm-foundry>=0.4.0
+llm-foundry==0.9.0
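One more sketch tying the two new entry points together: test_classif_model deliberately swaps the head in before load_state_dict, because a classification checkpoint already contains the (dim, num_classes) head rather than the LM head. The round trip below reuses the toy stand-in from the earlier sketch (paths and sizes are hypothetical, not from this patch):

import torch
import torch.nn as nn

class ToyLM(nn.Module):
    def __init__(self, dim=64, vocab_size=100):
        super().__init__()
        self.body = nn.Linear(dim, dim)
        self.output = nn.Linear(dim, vocab_size, bias=False)

# "Fine-tuned" classifier checkpoint, as create_classif_model would produce.
trained = ToyLM()
trained.output = nn.Linear(64, 2, bias=False)
torch.save({"state_dict": trained.state_dict()}, "/tmp/classif.pt")

# Reload for evaluation, as test_classif_model does: head first, weights second.
fresh = ToyLM()
fresh.output = nn.Linear(64, 2, bias=False)
ckpt = torch.load("/tmp/classif.pt", map_location="cpu")
fresh.load_state_dict(ckpt["state_dict"])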