From 6f60508e92cc822324938446e825fa9cddc0cf84 Mon Sep 17 00:00:00 2001
From: Youssef Mansour
Date: Tue, 12 Nov 2024 15:48:16 +0100
Subject: [PATCH] update open_lm

---
 open_lm/__pycache__/__init__.cpython-310.pyc  | Bin 139 -> 139 bytes
 open_lm/__pycache__/attention.cpython-310.pyc | Bin 5816 -> 5820 bytes
 open_lm/__pycache__/data.cpython-310.pyc      | Bin 21889 -> 21909 bytes
 .../__pycache__/distributed.cpython-310.pyc   | Bin 2941 -> 2941 bytes
 open_lm/__pycache__/eval.cpython-310.pyc      | Bin 1124 -> 0 bytes
 open_lm/__pycache__/eval3_seq.cpython-310.pyc | Bin 0 -> 1837 bytes
 open_lm/__pycache__/evaluate.cpython-310.pyc  | Bin 3939 -> 3939 bytes
 .../__pycache__/extra_funcs.cpython-310.pyc   | Bin 0 -> 6320 bytes
 .../__pycache__/extra_funcs2.cpython-310.pyc  | Bin 0 -> 6119 bytes
 .../__pycache__/file_utils.cpython-310.pyc    | Bin 14764 -> 14764 bytes
 open_lm/__pycache__/logger.cpython-310.pyc    | Bin 771 -> 771 bytes
 open_lm/__pycache__/losses.cpython-310.pyc    | Bin 1150 -> 1150 bytes
 open_lm/__pycache__/main.cpython-310.pyc      | Bin 23020 -> 23020 bytes
 open_lm/__pycache__/main2.cpython-310.pyc     | Bin 23056 -> 0 bytes
 open_lm/__pycache__/meters.cpython-310.pyc    | Bin 3443 -> 3443 bytes
 open_lm/__pycache__/model.cpython-310.pyc     | Bin 15249 -> 15256 bytes
 open_lm/__pycache__/norms.cpython-310.pyc     | Bin 5027 -> 5027 bytes
 open_lm/__pycache__/params.cpython-310.pyc    | Bin 18964 -> 18964 bytes
 open_lm/__pycache__/precision.cpython-310.pyc | Bin 651 -> 651 bytes
 open_lm/__pycache__/scheduler.cpython-310.pyc | Bin 1867 -> 1867 bytes
 open_lm/__pycache__/train.cpython-310.pyc     | Bin 9619 -> 9619 bytes
 open_lm/attention.py                          |    2 +-
 open_lm/data.py                               |   12 +-
 .../make_2048-checkpoint.py                   |  255 ++
 open_lm/datapreprocess/make_2048.py           |    4 +-
 open_lm/datapreprocess/wiki_download.py       |    2 +-
 open_lm/eval.py                               |   44 -
 open_lm/eval2.py                              |   96 ++
 open_lm/eval3.py                              |  116 ++
 open_lm/eval3_prop.py                         |  213 ++++
 open_lm/eval3_prop_2048.py                    |  139 +++
 open_lm/eval3_varylength.py                   |  118 ++
 open_lm/eval3_varylength_2000.py              |  124 ++
 open_lm/eval4.py                              |  139 +++
 open_lm/eval5.py                              |  162 +++
 open_lm/eval_redpajama_seq.py                 |  167 +++
 open_lm/extra_funcs.py                        |  214 ----
 open_lm/hf/__init__.py                        |    3 +
 .../hf/__pycache__/__init__.cpython-310.pyc   | Bin 0 -> 319 bytes
 .../configuration_openlm.cpython-310.pyc      | Bin 0 -> 854 bytes
 .../modeling_openlm.cpython-310.pyc           | Bin 0 -> 5904 bytes
 .../tokenization_openlm.cpython-310.pyc       | Bin 0 -> 525 bytes
 open_lm/hf/configuration_openlm.py            |   24 +
 open_lm/hf/modeling_openlm.py                 |  194 ++++
 open_lm/hf/tokenization_openlm.py             |   18 +
 open_lm/infer_proportions.py                  |   57 -
 open_lm/main2.py                              | 1034 -----------------
 open_lm/manifest.jsonl                        |  200 ----
 open_lm/model.py                              |   11 +-
 open_lm/params.py                             |    3 +-
 .../__pycache__/__init__.cpython-310.pyc      | Bin 160 -> 160 bytes
 .../__pycache__/head_rotary.cpython-310.pyc   | Bin 1889 -> 1889 bytes
 .../__pycache__/llama_rotary.cpython-310.pyc  | Bin 5794 -> 5794 bytes
 .../__pycache__/none.cpython-310.pyc          | Bin 338 -> 338 bytes
 .../__pycache__/rotary.cpython-310.pyc        | Bin 3709 -> 3709 bytes
 open_lm/run_bench.sh                          |   19 -
 open_lm/test_class.py                         |   76 ++
 open_lm/train_class.py                        |   68 ++
 .../__pycache__/__init__.cpython-310.pyc      | Bin 145 -> 145 bytes
 .../averaging_utils.cpython-310.pyc           | Bin 3758 -> 3758 bytes
 .../make_wds_manifest.cpython-310.pyc         | Bin 2841 -> 2827 bytes
 open_lm/utils/llm_foundry_wrapper.py          |   11 +-
 .../__pycache__/__init__.cpython-310.pyc      | Bin 158 -> 158 bytes
 .../__pycache__/hf_config.cpython-310.pyc     | Bin 0 -> 1831 bytes
 .../__pycache__/hf_model.cpython-310.pyc      | Bin 0 -> 7376 bytes
 .../__pycache__/hf_wrapper.cpython-310.pyc    | Bin 1393 -> 1393 bytes
 requirements.txt                              |    2 +-
 requirements_test.txt                         |    2 +-
 68 files changed, 1938 insertions(+), 1591 deletions(-)
 delete mode 100644 open_lm/__pycache__/eval.cpython-310.pyc
 create mode 100644 open_lm/__pycache__/eval3_seq.cpython-310.pyc
 create mode 100644 open_lm/__pycache__/extra_funcs.cpython-310.pyc
 create mode 100644 open_lm/__pycache__/extra_funcs2.cpython-310.pyc
 delete mode 100644 open_lm/__pycache__/main2.cpython-310.pyc
 create mode 100644 open_lm/datapreprocess/.ipynb_checkpoints/make_2048-checkpoint.py
 delete mode 100644 open_lm/eval.py
 create mode 100644 open_lm/eval2.py
 create mode 100644 open_lm/eval3.py
 create mode 100644 open_lm/eval3_prop.py
 create mode 100644 open_lm/eval3_prop_2048.py
 create mode 100644 open_lm/eval3_varylength.py
 create mode 100644 open_lm/eval3_varylength_2000.py
 create mode 100644 open_lm/eval4.py
 create mode 100644 open_lm/eval5.py
 create mode 100644 open_lm/eval_redpajama_seq.py
 delete mode 100644 open_lm/extra_funcs.py
 create mode 100644 open_lm/hf/__init__.py
 create mode 100644 open_lm/hf/__pycache__/__init__.cpython-310.pyc
 create mode 100644 open_lm/hf/__pycache__/configuration_openlm.cpython-310.pyc
 create mode 100644 open_lm/hf/__pycache__/modeling_openlm.cpython-310.pyc
 create mode 100644 open_lm/hf/__pycache__/tokenization_openlm.cpython-310.pyc
 create mode 100644 open_lm/hf/configuration_openlm.py
 create mode 100644 open_lm/hf/modeling_openlm.py
 create mode 100644 open_lm/hf/tokenization_openlm.py
 delete mode 100644 open_lm/infer_proportions.py
 delete mode 100644 open_lm/main2.py
 delete mode 100644 open_lm/manifest.jsonl
 delete mode 100644 open_lm/run_bench.sh
 create mode 100644 open_lm/test_class.py
 create mode 100644 open_lm/train_class.py
 create mode 100644 open_lm/utils/transformers/__pycache__/hf_config.cpython-310.pyc
 create mode 100644 open_lm/utils/transformers/__pycache__/hf_model.cpython-310.pyc

[GIT binary patches omitted: the deltas for the __pycache__/*.cpython-310.pyc files
listed in the diffstat above are machine-generated bytecode with no reviewable
content. The text diffs for open_lm/attention.py and open_lm/data.py fell inside
the same garbled span and could not be recovered.]
zTHLn$7C#0t^&4#Plj7awl}q?sqMeessYO)FrrApmO4yd&=@ijp)2nNCitEFl3LP0fm<*I~sli zg;;pmufWp;l)ULQwRJ3AI7gvrgpTt`cZ@&_=L_=Cg;5fg(EBr0)8(14QojxhlfpXZy~dNG71_v_3#GiT2E z&Y3g2tGK;bnX9ZE>!ZK>&L53;zFB!OTrI=tw2?7%o^uR7AjRL|b{RpLuZERPX)5C>;;nA@ zLR|wRuC7ezYszD-GO0q|hiM4F#rpcrHpP;sQ3R?pCSZ*^s84)ezpiUP%_BknD*cme zkqjivbb%khqSv4ac?6Koa7?Kc@=|PRXl8rFp@ttShp?)WZ^n+N#JugFYnV0%$SV*kV!m#1kV`RI}1 zQb1DaFV31Rwc@Q59LSI%E33z>42KtTOs?RlS%zl(I%1x0+Ssv`G_>~!7VH2VB#;5y zB7TlS=l7`Lhg2+k#Ii{X=6SK)FpR^r|6BTOxLf9{V&j7Ru(}zSCl$2?s-QQKY7s|( zGHe&EM7p3mIUY^&H%RndfPlvdB1~2o`8?09u`)=Hc&RzonuTJj3KkD$pk}dI*4!Rd|tc{C$kQs=C#yst=%1 zX;GzxDBK2ttnib74*{d~wf&}3L%6$qC*TFZHo$QLO7MFap8}{OmGzIID!(Sb|KBQE zvUf~8p4F*}_%2#2{TO)#tL_)dxtmdvn|$K@)Wvss_g@wI4U6L#?faEJuK~=h8PAUm z8*T5f$;hnR-{8JmACcyfOp686x>-^5Pg`0KBQlW9@eJqKvK#P>_-b0?tZOrOhQz() z2*nx5o4U*6b=P6W^b4))U|>DqzobPfMiK<+8QlFQFH7!$(3}N~CL=kcw_3Fv#?O&F zLj8RB(;0RC8ZR$I85-|darNdqTR+2QtmI<=UjUTpl{AWsAF84|TW| zSo#RyQGl0Cou<;CvK(qL31ZyjUt)nu@;4Y=2HY^+r$xGBGCMuIrK4GE`W{Pv0Ne{$ z2ynZ&IMMlXRAPPyAWR_r3De;}Vp@)Ax#2ijJ!@o4S*?af)Uo;v$I4|#1h(0y+fgHJ za5+J(@C3S%&L<82lX!R58FodzHV0 zvL>1~Y}<v_XT%Ib8^i0QUNJ(nJmsH3VMr^rGv7Sp!+OPa1ZV@fRMj|H}BgMpH^x2zP11tpjPNHqOOTXE_v~BKc=9HlPNS+L@394bP?aT&+D98PEpuy ze26VfF;aXX>{cal58o`Fo7cwlLhM&5j0UM>b8xOe?)TI@#NIf9J0Ca)5V1CpFVvlx>$FA-{B6~D!roS#=* zQpE?Rbd&S*b5iol^EO*c@G~J_Yv8Jqv=Y-VAcuz925 z2~Nhe%~jS}oC--mYi@D+CFZ6U7bGU97I_2tMLr-R1VprLW^s*RVPu~i>zTm@Qd`tE I`Lw4e02`w>8~^|S delta 168 zcmbO-g>lLhM&5j0UM>b8kexd}EoLL{96`pY$*Tn8fF!5TEXIV*ON1I&#jE&|^Ye;J zbd&S*b5iol^KP+%`J1gJ_?a0SCwogzU~JiZM|wFoW8&r(^$J$Tw9S8YHnTAHZQf{j zf|D_2bCq=#r$PeInp>QHiMgr81&PV2MczPukq?Ln0TC^mSzIGn7}+MrdSu{4I8-|m;fuI1cm?r diff --git a/open_lm/__pycache__/scheduler.cpython-310.pyc b/open_lm/__pycache__/scheduler.cpython-310.pyc index 9651a64268c4d5ab6815528c6cf3ef49a63487b6..d4301f7893c35517c65085ea390d9a3622f3f473 100644 GIT binary patch delta 20 acmX@jcbbnopO=@50SNZ~KC_YAfgJ!ls|DBq delta 20 acmX@jcbbnopO=@50SKm-Hf-c}U BUFFER_MAX: + time.sleep(1) + continue + + if buffer_lock.locked(): + if len(queue) < QUEUE_MAX: + queue.append(chunk) + else: + time.sleep(1) + else: + if queue: + dump_queue_to_buffer() + with buffer_lock: + buffer.append(chunk) + + +def consumer(my_id, output_dir, threads, buffer, buffer_lock, num_consumers, upload_to_s3=False): + output_directory = f"{output_dir}/{CHUNK_SIZE - 1}-v1/{my_id}" + os.makedirs(output_directory, exist_ok=True) + shard_writer = ShardWriter(os.path.join(output_directory, "shard-%07d.tar"), maxcount=SHARD_SIZE) + + chunks = [] + + start_time = time.time() + + while any(t.is_alive() for t in threads): + time.sleep(SLEEP_TIME) + with buffer_lock: + lenb = len(buffer) + print("Length of buffer", lenb) + if lenb >= BUFFER_MIN: + while buffer and len(chunks) < SHARD_SIZE: + chunks.append(pop_random(buffer)) + + if len(chunks) == SHARD_SIZE: + print(f"I am {my_id} and I am writing a shard.", len(buffer)) + write_to_shard(chunks, shard_writer) + if upload_to_s3: + upload_to_s3_and_remove(shard_writer.fname) + # print("FNAME", shard_writer.fname) + chunks = [] + time_for_shard = time.time() - start_time + print("shards / s", num_consumers / time_for_shard) + print("tokens / s", num_consumers * SHARD_SIZE * CHUNK_SIZE / time_for_shard) + print( + "hours req for 1.2T tokens", + 1_200_000_000_000 / (num_consumers * SHARD_SIZE * CHUNK_SIZE / time_for_shard) / 3600, + ) + + start_time = time.time() + + # Process the remaining items in the buffer after all threads have completed + while buffer: + with buffer_lock: + while buffer and len(chunks) < SHARD_SIZE: + 
+
+
+def tokenize_eleutherai(tokenizer, string):
+    return tokenizer(string).input_ids
+
+
+# =========================================================
+# =           Main function + Argument parsing            =
+# =========================================================
+
+
+def main(
+    input_files,
+    output_dir,
+    tokenizer="EleutherAI/gpt-neox-20b",
+    num_workers=32,
+    num_consumers=8,
+    upload_to_s3=False,
+):
+    os.makedirs(f"{output_dir}/tars-{CHUNK_SIZE - 1}-v1", exist_ok=True)
+
+    input_files = [glob.glob(input_file) for input_file in input_files]
+    input_files = [x for y in input_files for x in y]
+
+    # Shuffle the input files
+    random.shuffle(input_files)
+
+    print("Input files", input_files)
+
+    enc = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b")
+
+    tokenize = lambda x: tokenize_eleutherai(enc, x)
+    buffer = []  # Use list instead of queue.Queue
+    buffer_lock = threading.Lock()
+
+    files_per_worker = len(input_files) // num_workers
+    threads = []
+    for i in range(num_workers):
+        start = i * files_per_worker
+        end = (i + 1) * files_per_worker if i < num_workers - 1 else len(input_files)
+        t = threading.Thread(
+            target=process_files,
+            args=(input_files[start:end], buffer, tokenize, buffer_lock),
+        )
+        t.start()
+        threads.append(t)
+
+    consumer_threads = []
+    for i in range(num_consumers):
+        t = threading.Thread(
+            target=consumer,
+            args=(
+                i,
+                output_dir,
+                threads,
+                buffer,
+                buffer_lock,
+                num_consumers,
+                upload_to_s3,
+            ),
+        )
+        t.start()
+        consumer_threads.append(t)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-files", type=str, nargs="+")
+    parser.add_argument("--output-dir", type=Path)
+    parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b")
+    parser.add_argument("--num-workers", type=int, default=32)
+    parser.add_argument("--num-consumers", type=int, default=8)
+    parser.add_argument("--upload-to-s3", action="store_true")
+
+    args = parser.parse_args()
+
+    main(
+        args.input_files,
+        args.output_dir,
+        args.tokenizer,
+        args.num_workers,
+        args.num_consumers,
+        args.upload_to_s3,
+    )
\ No newline at end of file
diff --git a/open_lm/datapreprocess/make_2048.py b/open_lm/datapreprocess/make_2048.py
index e0da8bb..69e7429 100644
--- a/open_lm/datapreprocess/make_2048.py
+++ b/open_lm/datapreprocess/make_2048.py
@@ -20,7 +20,7 @@
 # ========================================
 
 QUEUE_MAX = 10_000
-BUFFER_MIN = 10_000
+BUFFER_MIN = 100_000
 BUFFER_MAX = 200_000
 CHUNK_SIZE = 2048 + 1
 SHARD_SIZE = 8192
@@ -252,4 +252,4 @@ def main(
         args.num_workers,
         args.num_consumers,
         args.upload_to_s3,
-    )
+    )
\ No newline at end of file
diff --git a/open_lm/datapreprocess/wiki_download.py b/open_lm/datapreprocess/wiki_download.py
index f5f1a05..a4e10da 100644
--- a/open_lm/datapreprocess/wiki_download.py
+++ b/open_lm/datapreprocess/wiki_download.py
@@ -30,4 +30,4 @@ def main(output_dir):
     )
 
     args = parser.parse_args()
-    main(args.output_dir)
\ No newline at end of file
+    main(args.output_dir)
diff --git a/open_lm/eval.py b/open_lm/eval.py
deleted file mode 100644
index 196f006..0000000
--- a/open_lm/eval.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-from open_lm.params import parse_args
-from open_lm.model import test_classif_model
-import webdataset as wds
-from open_lm.data import get_wds_dataset
-from open_lm.data import sample_chunk
-
-args = parse_args([])
-args.per_gpu_val_batch_size = 8
-args.vocab_size = 50432
-args.seq_len = 2048
-args.world_size = 1
-args.rank = 0
-
-args.model = "open_lm_160m"
-model_path = "/media/logs/classif_C4160m3.2B_C4DCLM_320M/checkpoints/epoch_1.pt"
-
-args.val_data = ['/media/datasets/C4/C4-shard-0000219.tar']
-
-model = test_classif_model(args, model_path)
-model = model.to('cuda')
-
-dataset = get_wds_dataset(args, is_train=False, epoch=0, floor=True, tokenizer=None, data_key="txt", force_num_samples=None)
-
-dataloader = dataset.dataloader
-
-sum = 0
-for sample in dataloader:
-    (texts,) = sample
-    texts = torch.LongTensor(texts).to('cuda')
-    inputs, targets = sample_chunk(texts, args)
-
-    with torch.no_grad():
-        out, _, _ = model(inputs)
-
-    pred = torch.argmax(out,2)[:,-1].sum()
-
-    sum = sum + pred.item()
-
-print(sum)
-
-
-
-
diff --git a/open_lm/eval2.py b/open_lm/eval2.py
new file mode 100644
index 0000000..2d5121d
--- /dev/null
+++ b/open_lm/eval2.py
@@ -0,0 +1,96 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+
+###########################################################################################################
+
+args.num_classes = 2
+
+#Dolma_gen.pt
+#DCLM_gen.pt
+#FWEdu_gen.pt
+
+#'C4.pt'
+#'FineWeb.pt'
+#'RefinedWeb.pt'
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+
+
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
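+
+# NOTE: each .pt test set is a list of tokenized sequences. A sequence is
+# classified from the argmax over class logits at its final token, and it counts
+# as correct when that class id matches the set's label (0 for str1, 1 for str2).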
+
+
+
+dataset = torch.load(data_path1)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    sum = sum + n_correct
+
+sum1 = sum
+len1 = len(dataset)
+print(str1, sum1, "/" , len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    sum = sum + n_correct
+
+sum2 = sum
+len2 = len(dataset)
+print(str2, sum2, "/" , len2)
+
+##########################################################################################################################################################################################
+
+
+total_sum = sum1+sum2
+total_length = len1+len2
+
+print("Total= ", total_sum, "/" , total_length )
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/eval3.py b/open_lm/eval3.py
new file mode 100644
index 0000000..9c1e761
--- /dev/null
+++ b/open_lm/eval3.py
@@ -0,0 +1,116 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+
+###########################################################################################################
+
+args.num_classes = 3
+
+#Dolma_gen.pt
+#DCLM_gen.pt
+#FWEdu_gen.pt
+
+#'C4.pt'
+#'FineWeb.pt'
+#'RefinedWeb.pt'
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+
+
+dataset = torch.load(data_path1)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    sum = sum + n_correct
+
+sum1 = sum
+len1 = len(dataset)
+print(str1, sum1, "/" , len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    sum = sum + n_correct
+
+sum2 = sum
+len2 = len(dataset)
+print(str2, sum2, "/" , len2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+sum = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out,2)[:,-1]
+
+    n_correct = torch.sum(pred == 2).item()
+
+    sum = sum + n_correct
+
+sum3 = sum
+len3 = len(dataset)
+print(str3, sum3, "/" , len3)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1+sum2+sum3
+total_length = len1+len2+len3
+
+print("Total= ", total_sum, "/" , total_length )
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/eval3_prop.py b/open_lm/eval3_prop.py
new file mode 100644
index 0000000..8338786
--- /dev/null
+++ b/open_lm/eval3_prop.py
@@ -0,0 +1,213 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+#parser.add_argument('--str3', type=str, help='test set 3')
+#parser.add_argument('--str4', type=str, help='test set 4')
+#parser.add_argument('--str5', type=str, help='test set 5')
+#parser.add_argument('--str6', type=str, help='test set 6')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+
+###########################################################################################################
+
+args.num_classes = 2
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+#str3 = cmd_args.str3
+#str4 = cmd_args.str4
+#str5 = cmd_args.str5
+#str6 = cmd_args.str6
+
+
+data1= "Llama1_gen" #"DCLM_gen"
+data2= "Dolma_gen"
+data3= "FWEdu_gen"
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + data1 + '.pt'
+data_path2 = base_path + data2 + '.pt'
+data_path3 = base_path + data3 + '.pt'
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+
+soft_max = torch.nn.Softmax(dim=2)
+###########################################################################################################
+
+pred = []
+conf=[]
+
+dataset = torch.load(data_path1)
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    out = soft_max(out)
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+    conf.append( torch.max(out,2)[0][:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+#c3 = pred.count(2)
+#c4 = pred.count(3)
+#c5 = pred.count(4)
+#c6 = pred.count(5)
+
+
+sum_conf1 = sum(c for p, c in zip(pred, conf) if p == 0)
+sum_conf2 = sum(c for p, c in zip(pred, conf) if p == 1)
+#sum_conf3 = sum(c for p, c in zip(pred, conf) if p == 2)
+#sum_conf4 = sum(c for p, c in zip(pred, conf) if p == 3)
+#sum_conf5 = sum(c for p, c in zip(pred, conf) if p == 4)
+#sum_conf6 = sum(c for p, c in zip(pred, conf) if p == 5)
+
+
+av1 = sum_conf1/c1 if c1>0 else 0
+av2 = sum_conf2/c2 if c2>0 else 0
+#av3 = sum_conf3/c3 if c3>0 else 0
+#av4 = sum_conf4/c4 if c4>0 else 0
+#av5 = sum_conf5/c5 if c5>0 else 0
+#av6 = sum_conf6/c6 if c6>0 else 0
+
+
+
+length = len(dataset)
+
+print(data1, ':')
+print(str1, c1, "/", length, '=', c1/length, "with confidence ", av1)
+print(str2, c2, "/", length, '=', c2/length, "with confidence ", av2)
+#print(str3, c3, "/", length, '=', c3/length, "with confidence ", av3)
+#print(str4, c4, "/", length, '=', c4/length, "with confidence ", av4)
+#print(str5, c5, "/", length, '=', c5/length, "with confidence ", av5)
+#print(str6, c6, "/", length, '=', c6/length, "with confidence ", av6)
+print("\n")
+
+exit()
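+
+# NOTE: everything below this exit() is dead code. The two remaining blocks also
+# reference str3-str7, whose argparse flags are commented out above, so they
+# would raise NameError if the exit() were removed.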
+##########################################################################################################################################################################################
+
+pred = []
+conf=[]
+
+dataset = torch.load(data_path2)
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    out = soft_max(out)
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+    conf.append( torch.max(out,2)[0][:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+c4 = pred.count(3)
+c5 = pred.count(4)
+c6 = pred.count(5)
+c7 = pred.count(6)
+
+sum_conf1 = sum(c for p, c in zip(pred, conf) if p == 0)
+sum_conf2 = sum(c for p, c in zip(pred, conf) if p == 1)
+sum_conf3 = sum(c for p, c in zip(pred, conf) if p == 2)
+sum_conf4 = sum(c for p, c in zip(pred, conf) if p == 3)
+sum_conf5 = sum(c for p, c in zip(pred, conf) if p == 4)
+sum_conf6 = sum(c for p, c in zip(pred, conf) if p == 5)
+sum_conf7 = sum(c for p, c in zip(pred, conf) if p == 6)
+
+av1 = sum_conf1/c1 if c1>0 else 0
+av2 = sum_conf2/c2 if c2>0 else 0
+av3 = sum_conf3/c3 if c3>0 else 0
+av4 = sum_conf4/c4 if c4>0 else 0
+av5 = sum_conf5/c5 if c5>0 else 0
+av6 = sum_conf6/c6 if c6>0 else 0
+av7 = sum_conf7/c7 if c7>0 else 0
+
+
+length = len(dataset)
+
+print(data2, ':')
+print(str1, c1, "/", length, '=', c1/length, "with confidence ", av1)
+print(str2, c2, "/", length, '=', c2/length, "with confidence ", av2)
+print(str3, c3, "/", length, '=', c3/length, "with confidence ", av3)
+print(str4, c4, "/", length, '=', c4/length, "with confidence ", av4)
+print(str5, c5, "/", length, '=', c5/length, "with confidence ", av5)
+print(str6, c6, "/", length, '=', c6/length, "with confidence ", av6)
+print(str7, c7, "/", length, '=', c7/length, "with confidence ", av7)
+print("\n")
+
+##########################################################################################################################################################################################
+
+pred = []
+conf=[]
+
+dataset = torch.load(data_path3)
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    out = soft_max(out)
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+    conf.append( torch.max(out,2)[0][:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+c4 = pred.count(3)
+c5 = pred.count(4)
+c6 = pred.count(5)
+c7 = pred.count(6)
+
+sum_conf1 = sum(c for p, c in zip(pred, conf) if p == 0)
+sum_conf2 = sum(c for p, c in zip(pred, conf) if p == 1)
+sum_conf3 = sum(c for p, c in zip(pred, conf) if p == 2)
+sum_conf4 = sum(c for p, c in zip(pred, conf) if p == 3)
+sum_conf5 = sum(c for p, c in zip(pred, conf) if p == 4)
+sum_conf6 = sum(c for p, c in zip(pred, conf) if p == 5)
+sum_conf7 = sum(c for p, c in zip(pred, conf) if p == 6)
+
+av1 = sum_conf1/c1 if c1>0 else 0
+av2 = sum_conf2/c2 if c2>0 else 0
+av3 = sum_conf3/c3 if c3>0 else 0
+av4 = sum_conf4/c4 if c4>0 else 0
+av5 = sum_conf5/c5 if c5>0 else 0
+av6 = sum_conf6/c6 if c6>0 else 0
+av7 = sum_conf7/c7 if c7>0 else 0
+
+
+length = len(dataset)
+
+print(data3, ':')
+print(str1, c1, "/", length, '=', c1/length, "with confidence ", av1)
+print(str2, c2, "/", length, '=', c2/length, "with confidence ", av2)
+print(str3, c3, "/", length, '=', c3/length, "with confidence ", av3)
+print(str4, c4, "/", length, '=', c4/length, "with confidence ", av4)
+print(str5, c5, "/", length, '=', c5/length, "with confidence ", av5)
+print(str6, c6, "/", length, '=', c6/length, "with confidence ", av6)
+print(str7, c7, "/", length, '=', c7/length, "with confidence ", av7)
+print("\n")
+##########################################################################################################################################################################################
+
+
diff --git a/open_lm/eval3_prop_2048.py b/open_lm/eval3_prop_2048.py
new file mode 100644
index 0000000..d269b36
--- /dev/null
+++ b/open_lm/eval3_prop_2048.py
@@ -0,0 +1,139 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+import webdataset as wds
+from open_lm.data import get_wds_dataset
+from open_lm.data import sample_chunk
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+parser.add_argument('--str4', type=str, help='test set 4')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+args.per_gpu_val_batch_size = 1
+args.vocab_size = 50432
+args.seq_len = 2047
+args.world_size = 1
+args.rank = 0
+
+###########################################################################################################
+
+args.num_classes = 4
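+
+# NOTE: the checkpoint is loaded as a 4-way classifier (str1-str4), but only the
+# first block below reports the share of the fourth class; `length` is
+# hard-coded to 4096 rather than counted from the loader, so the printed
+# fractions assume exactly 4096 sequences per .tar test shard.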
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+str4 = cmd_args.str4
+
+data1= "DCLM"
+data2= "Dolma"
+data3= "FWEdu"
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + data1 + '.tar'
+data_path2 = base_path + data2 + '.tar'
+data_path3 = base_path + data3 + '.tar'
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+###########################################################################################################
+
+args.val_data = [data_path1]
+dataset = get_wds_dataset(args, is_train=False, epoch=0, floor=True, tokenizer=None, data_key="txt", force_num_samples=None)
+dataloader = dataset.dataloader
+
+
+pred = []
+for sample in dataloader:
+    (texts,) = sample
+    inputs = torch.LongTensor(texts).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(inputs)
+
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+c4 = pred.count(3)
+
+length = 4096
+
+print(data1, ':')
+print(str1, c1, "/", length, '=', c1/length)
+print(str2, c2, "/", length, '=', c2/length)
+print(str3, c3, "/", length, '=', c3/length)
+print(str4, c4, "/", length, '=', c4/length)
+
+##########################################################################################################################################################################################
+
+args.val_data = [data_path2]
+dataset = get_wds_dataset(args, is_train=False, epoch=0, floor=True, tokenizer=None, data_key="txt", force_num_samples=None)
+dataloader = dataset.dataloader
+
+
+pred = []
+for sample in dataloader:
+    (texts,) = sample
+    inputs = torch.LongTensor(texts).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(inputs)
+
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+
+length = 4096
+
+print(data2, ':')
+print(str1, c1, "/", length, '=', c1/length)
+print(str2, c2, "/", length, '=', c2/length)
+print(str3, c3, "/", length, '=', c3/length)
+
+##########################################################################################################################################################################################
+
+args.val_data = [data_path3]
+dataset = get_wds_dataset(args, is_train=False, epoch=0, floor=True, tokenizer=None, data_key="txt", force_num_samples=None)
+dataloader = dataset.dataloader
+
+
+pred = []
+for sample in dataloader:
+    (texts,) = sample
+    inputs = torch.LongTensor(texts).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(inputs)
+
+    pred.append( torch.argmax(out,2)[:,-1].item() )
+
+c1 = pred.count(0)
+c2 = pred.count(1)
+c3 = pred.count(2)
+
+length = 4096
+
+print(data3, ':')
+print(str1, c1, "/", length, '=', c1/length)
+print(str2, c2, "/", length, '=', c2/length)
+print(str3, c3, "/", length, '=', c3/length)
+
+##########################################################################################################################################################################################
+
+
diff --git a/open_lm/eval3_varylength.py b/open_lm/eval3_varylength.py
new file mode 100644
index 0000000..e6f1c5c
--- /dev/null
+++ b/open_lm/eval3_varylength.py
@@ -0,0 +1,118 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+
+
+###########################################################################################################
+
+args.num_classes = 3
+
+#Dolma_gen.pt
+#DCLM_gen.pt
+#FWEdu_gen.pt
+
+#'C4.pt'
+#'FineWeb.pt'
+#'RefinedWeb.pt'
+
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+###########################################################################################################
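+
+# NOTE: each .pt file here holds n_bins lists of tokenized sequences, bucketed
+# by sequence length in 200-token steps (bin i covers lengths i*200 to
+# i*200+200). Correct predictions are tallied per bin; since len1/len2/len3
+# keep only the *last* bin's sample count, the final per-bin accuracy assumes
+# every bin holds the same number of samples.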
+dataset = torch.load(data_path1)
+n_bins = len(dataset)
+correct = torch.zeros(n_bins, dtype=torch.int)
+
+for i in range(n_bins):
+    n_samples = len(dataset[i])
+    for j in range(n_samples):
+        sample = torch.LongTensor(dataset[i][j]).to('cuda')
+        with torch.no_grad():
+            out, _, _ = model(sample)
+        pred = torch.argmax(out, 2)[:, -1]
+
+        if pred == 0:
+            correct[i] += 1
+
+sum1 = correct
+len1 = n_samples  # assumes every bin holds the same number of samples
+print(str1, sum1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+n_bins = len(dataset)
+correct = torch.zeros(n_bins, dtype=torch.int)
+
+for i in range(n_bins):
+    n_samples = len(dataset[i])
+    for j in range(n_samples):
+        sample = torch.LongTensor(dataset[i][j]).to('cuda')
+        with torch.no_grad():
+            out, _, _ = model(sample)
+        pred = torch.argmax(out, 2)[:, -1]
+
+        if pred == 1:
+            correct[i] += 1
+
+sum2 = correct
+len2 = n_samples
+print(str2, sum2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+n_bins = len(dataset)
+correct = torch.zeros(n_bins, dtype=torch.int)
+
+for i in range(n_bins):
+    n_samples = len(dataset[i])
+    for j in range(n_samples):
+        sample = torch.LongTensor(dataset[i][j]).to('cuda')
+        with torch.no_grad():
+            out, _, _ = model(sample)
+        pred = torch.argmax(out, 2)[:, -1]
+
+        if pred == 2:
+            correct[i] += 1
+
+sum3 = correct
+len3 = n_samples
+print(str3, sum3)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1 + sum2 + sum3
+total_len = len1 + len2 + len3
+
+print(len1, len2, len3, "\n")
+
+for i in range(n_bins):
+    print("Accuracy at bin", i, "(seq. lengths", i*200, "to", i*200+200, ") is:", total_sum[i].item()/total_len * 100, "%")
+
diff --git a/open_lm/eval3_varylength_2000.py b/open_lm/eval3_varylength_2000.py
new file mode 100644
index 0000000..06ada50
--- /dev/null
+++ b/open_lm/eval3_varylength_2000.py
@@ -0,0 +1,124 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+###########################################################################################################
+
+args.num_classes = 3
+
+# Candidate test-set triples: Dolma_gen / DCLM_gen / FWEdu_gen, or C4 / FineWeb / RefinedWeb (.pt files).
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+indices = torch.arange(0, 2048, 200)
+
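+# Accuracy is probed at fixed token positions (0, 200, ..., 2000): for each
+# batch the classifier's argmax is read at all probe indices at once, so
+# `pred` below has shape (batch, len(indices)). A minimal sketch of the idea
+# (illustrative shapes, assuming inputs of length 2048):
+#
+#   out, _, _ = model(batch)                  # (B, 2048, num_classes)
+#   pred = torch.argmax(out, 2)[:, indices]   # (B, 11)
+#   n_correct = (pred == label).sum(dim=0)    # per-position correct counts
+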
+correct = torch.zeros(len(indices))
+
+dataset = torch.load(data_path1)
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, indices]
+
+    n_correct = torch.sum(pred == 0, dim=0)
+
+    correct = correct + n_correct.cpu()
+
+sum1 = correct
+len1 = len(dataset)
+print(str1, sum1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+
+correct = torch.zeros(len(indices))
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, indices]
+
+    n_correct = torch.sum(pred == 1, dim=0)
+
+    correct = correct + n_correct.cpu()
+
+sum2 = correct
+len2 = len(dataset)
+print(str2, sum2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+
+correct = torch.zeros(len(indices))
+
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, indices]
+
+    n_correct = torch.sum(pred == 2, dim=0)
+
+    correct = correct + n_correct.cpu()
+
+sum3 = correct
+len3 = len(dataset)
+print(str3, sum3)
+
+##########################################################################################################################################################################################
+
+total = sum1 + sum2 + sum3
+
+print(len1, len2, len3, "\n")
+
+for i in range(len(indices)):
+    print("Accuracy at token", indices[i].item(), "=", total[i].item()/(len1+len2+len3))
+
+
diff --git a/open_lm/eval4.py b/open_lm/eval4.py
new file mode 100644
index 0000000..a732bbf
--- /dev/null
+++ b/open_lm/eval4.py
@@ -0,0 +1,139 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+parser.add_argument('--str4', type=str, help='test set 4')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+###########################################################################################################
+
+args.num_classes = 4
+
+# Candidate test sets: Dolma_gen / DCLM_gen / FWEdu_gen, or C4 / FineWeb / RefinedWeb (.pt files).
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+str4 = cmd_args.str4
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+data_path4 = base_path + str4 + '.pt'
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
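+# Each of the four blocks below scores one test set: dataset k is correct when
+# the argmax at the final token equals class k. A reusable sketch (illustrative
+# only, not part of open_lm's API):
+#
+#   def score(path, label):
+#       data = torch.load(path)
+#       hits = 0
+#       for s in data:
+#           with torch.no_grad():
+#               out, _, _ = model(torch.LongTensor(s).to('cuda'))
+#           hits += torch.sum(torch.argmax(out, 2)[:, -1] == label).item()
+#       return hits, len(data)
+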
+dataset = torch.load(data_path1)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    correct = correct + n_correct
+
+sum1 = correct
+len1 = len(dataset)
+print(str1, sum1, "/", len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    correct = correct + n_correct
+
+sum2 = correct
+len2 = len(dataset)
+print(str2, sum2, "/", len2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 2).item()
+
+    correct = correct + n_correct
+
+sum3 = correct
+len3 = len(dataset)
+print(str3, sum3, "/", len3)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path4)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 3).item()
+
+    correct = correct + n_correct
+
+sum4 = correct
+len4 = len(dataset)
+print(str4, sum4, "/", len4)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1 + sum2 + sum3 + sum4
+total_length = len1 + len2 + len3 + len4
+
+print("Total= ", total_sum, "/", total_length)
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/eval5.py b/open_lm/eval5.py
new file mode 100644
index 0000000..681223c
--- /dev/null
+++ b/open_lm/eval5.py
@@ -0,0 +1,162 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+parser.add_argument('--str1', type=str, help='test set 1')
+parser.add_argument('--str2', type=str, help='test set 2')
+parser.add_argument('--str3', type=str, help='test set 3')
+parser.add_argument('--str4', type=str, help='test set 4')
+parser.add_argument('--str5', type=str, help='test set 5')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+###########################################################################################################
+
+args.num_classes = 5
+
+# Candidate test sets: Dolma_gen / DCLM_gen / FWEdu_gen, or C4 / FineWeb / RefinedWeb (.pt files).
+
+str1 = cmd_args.str1
+str2 = cmd_args.str2
+str3 = cmd_args.str3
+str4 = cmd_args.str4
+str5 = cmd_args.str5
+
+base_path = '/media/datasets/test_set/'
+
+data_path1 = base_path + str1 + '.pt'
+data_path2 = base_path + str2 + '.pt'
+data_path3 = base_path + str3 + '.pt'
+data_path4 = base_path + str4 + '.pt'
+data_path5 = base_path + str5 + '.pt'
+
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
+dataset = torch.load(data_path1)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    correct = correct + n_correct
+
+sum1 = correct
+len1 = len(dataset)
+print(str1, sum1, "/", len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path2)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    correct = correct + n_correct
+
+sum2 = correct
+len2 = len(dataset)
+print(str2, sum2, "/", len2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path3)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 2).item()
+
+    correct = correct + n_correct
+
+sum3 = correct
+len3 = len(dataset)
+print(str3, sum3, "/", len3)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path4)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 3).item()
+
+    correct = correct + n_correct
+
+sum4 = correct
+len4 = len(dataset)
+print(str4, sum4, "/", len4)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(data_path5)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 4).item()
+
+    correct = correct + n_correct
+
+sum5 = correct
+len5 = len(dataset)
+print(str5, sum5, "/", len5)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1 + sum2 + sum3 + sum4 + sum5
+total_length = len1 + len2 + len3 + len4 + len5
+
+print("Total= ", total_sum, "/", total_length)
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/eval_redpajama_seq.py b/open_lm/eval_redpajama_seq.py
new file mode 100644
index 0000000..85fe453
--- /dev/null
+++ b/open_lm/eval_redpajama_seq.py
@@ -0,0 +1,167 @@
+import torch
+from open_lm.params import parse_args
+import argparse
+from open_lm.model import test_classif_model
+
+args = parse_args([])
+parser = argparse.ArgumentParser(description="Override params arguments with command-line arguments")
+parser.add_argument('--model', type=str, help='Model name to use for evaluation')
+parser.add_argument('--classif-model-path', type=str, help='Path to the classification model checkpoint')
+cmd_args = parser.parse_args()
+args.model = cmd_args.model
+args.classif_model_path = cmd_args.classif_model_path
+
+###########################################################################################################
+args.num_classes = 6
+
+path1 = "/media/datasets/RedPajama/val_seq/arxiv-shard-0000019.pt"
+path2 = "/media/datasets/RedPajama/val_seq/c4-shard-0000019.pt"
+path3 = "/media/datasets/RedPajama/val_seq/cc-shard-0000019.pt"
+path4 = "/media/datasets/RedPajama/val_seq/gh-shard-0000019.pt"
+path5 = "/media/datasets/RedPajama/val_seq/se-shard-0000019.pt"
+path6 = "/media/datasets/RedPajama/val_seq/wiki-shard-0000009.pt"
+
+str1 = "Arxiv"
+str2 = "C4"
+str3 = "CC"
+str4 = "Github"
+str5 = "StackExchange"
+str6 = "Wikipedia"
+###########################################################################################################
+
+model = test_classif_model(args)
+model = model.to('cuda')
+
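+# Example invocation (model and checkpoint names are placeholders, not committed defaults):
+#   python -m open_lm.eval_redpajama_seq --model open_lm_25m \
+#       --classif-model-path /path/to/checkpoints/epoch_1.pt
+# The six blocks below score one RedPajama source each; label k must match the
+# class index the classifier was trained with for that source.
+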
+dataset = torch.load(path1)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 0).item()
+
+    correct = correct + n_correct
+
+sum1 = correct
+len1 = len(dataset)
+print(str1, sum1, "/", len1)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path2)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 1).item()
+
+    correct = correct + n_correct
+
+sum2 = correct
+len2 = len(dataset)
+print(str2, sum2, "/", len2)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path3)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 2).item()
+
+    correct = correct + n_correct
+
+sum3 = correct
+len3 = len(dataset)
+print(str3, sum3, "/", len3)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path4)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 3).item()
+
+    correct = correct + n_correct
+
+sum4 = correct
+len4 = len(dataset)
+print(str4, sum4, "/", len4)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path5)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 4).item()
+
+    correct = correct + n_correct
+
+sum5 = correct
+len5 = len(dataset)
+print(str5, sum5, "/", len5)
+
+##########################################################################################################################################################################################
+
+dataset = torch.load(path6)
+correct = 0
+for sample in dataset:
+    sample = torch.LongTensor(sample).to('cuda')
+
+    with torch.no_grad():
+        out, _, _ = model(sample)
+
+    pred = torch.argmax(out, 2)[:, -1]
+
+    n_correct = torch.sum(pred == 5).item()
+
+    correct = correct + n_correct
+
+sum6 = correct
+len6 = len(dataset)
+print(str6, sum6, "/", len6)
+
+##########################################################################################################################################################################################
+
+total_sum = sum1 + sum2 + sum3 + sum4 + sum5 + sum6
+total_length = len1 + len2 + len3 + len4 + len5 + len6
+
+print("Total= ", total_sum, "/", total_length)
+print("Accuracy= ", total_sum/total_length * 100, "%")
+
+
diff --git a/open_lm/extra_funcs.py b/open_lm/extra_funcs.py
deleted file mode 100644
index b746d5e..0000000
--- a/open_lm/extra_funcs.py
+++ /dev/null
@@
-1,214 +0,0 @@ -import os -import shutil -import random -import json -import torch -import numpy as np -import subprocess - -from open_lm.params import parse_args -from open_lm.model import test_classif_model - -def inference(): - - args = parse_args([]) - args.model = "open_lm_25m" - args.classif_model_path = "/workspace/youssef/lrz/logs/RedPajama/prop/checkpoints/epoch_1.pt" - args.num_classes = 2 - - test_data_path = '/workspace/youssef/lrz/datasets/prop/Llama1_gen.pt' - dataset = torch.load(test_data_path) - - model = test_classif_model(args) - model = model.to('cuda:3') - - pred = [] - for sample in dataset: - sample = torch.LongTensor(sample).to('cuda:3') - with torch.no_grad(): - out, _, _ = model(sample) - pred.append(torch.argmax(out,2)[:,-1].item()) - - c1 = pred.count(0) - c2 = pred.count(1) - - print(c1,c2) - - if c2 > c1: - return 1 - else: - return 0 - -def train_classifier(cuda_devices="3", log_dir="/workspace/youssef/lrz/logs/RedPajama/prop"): - # Set the CUDA_VISIBLE_DEVICES environment variable - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_devices - - # Generate a random master port between 10000 and 65000 - master_port = random.randint(10000, 65000) - - # Construct the torchrun command - command = [ - "torchrun", - f"--master_port={master_port}", - "--nproc-per-node", "1", - "-m", "open_lm.main", - "--model", "open_lm_25m", - "--dataset-manifest", "/workspace/youssef/lrz/datasets/prop/train/manifest.jsonl", - "--train-num-samples", "200000000", - "--workers", "1", - "--precision", "amp_bfloat16", - "--grad-checkpointing", - "--log-every-n-steps", "100", - "--grad-clip-norm", "1", - "--global-batch-size", "16", - "--data-key", "txt", - "--lr", "3e-4", - "--warmup", "2000", - "--wd", "0.1", - "--beta2", "0.95", - "--epochs", "1", - "--resume", "latest", - "--logs", "/workspace/youssef/lrz/logs/RedPajama/", - "--name", "prop", - "--classification", "True", - "--num-classes", "2", - "--classif-model-path", "/workspace/youssef/lrz/logs/pretrain/25M_0.5BC4/checkpoint/epoch_1.pt" - ] - - os.makedirs(log_dir, exist_ok=True) - - # Create log file paths - stdout_log = os.path.join(log_dir, "output.log") - stderr_log = os.path.join(log_dir, "error.log") - - # Run the torchrun command using subprocess - with open(stdout_log, "w") as out_file, open(stderr_log, "w") as err_file: - try: - result = subprocess.run(command, check=True, stdout=out_file, stderr=err_file) - print(f"torchrun finished with return code: {result.returncode}") - except subprocess.CalledProcessError as e: - print(f"An error occurred while running torchrun: {e}") - - - -def proj_simplex(y): - m = len(y) - bget = False - s = sorted(y, reverse=True) # sorting in descending order - tmpsum = 0 - for i in range(m-1): - tmpsum = tmpsum + s[i] - tmax = (tmpsum - 1) / (i+1) - if tmax >= s[i+1]: - bget = True - break - if not bget: - tmax = (tmpsum + s[m-1] -1) / m - return np.maximum(y-tmax,0) - - - -def del_dir(dir_path): - try: - # Remove the directory and all its contents - shutil.rmtree(dir_path) - print(f"Removed directory: {dir_path}") - except FileNotFoundError: - print(f"Directory not found: {dir_path}") - except PermissionError: - print(f"Permission denied: {dir_path}") - except Exception as e: - print(f"An error occurred while removing the directory: {e}") - - -def round_preserving_sum(numbers): - """ - This function takes a list of numbers that add up to 1, multiplies each by 100, - rounds them to integers while preserving the sum as 100. 
- """ - # Step 1: Multiply all numbers by 100 - multiplied = np.array(numbers) * 100 - - # Step 2: Separate integer and decimal parts - integers = np.floor(multiplied).astype(int) # Integer parts - decimals = multiplied - integers # Decimal parts - - # Step 3: Calculate the difference between the current sum and 100 - current_sum = np.sum(integers) - difference = 100 - current_sum - - # Step 4: Distribute the difference by rounding up the largest decimals - if difference > 0: - # Get indices of the largest decimals and round up those numbers - indices_to_round_up = np.argsort(-decimals)[:difference] - integers[indices_to_round_up] += 1 - - return integers.tolist() - -def sample_and_rename_files(sample_counts_list): - - base_path = "/workspace/youssef/lrz/datasets/prop/original/" - output_folder = "/workspace/youssef/lrz/datasets/prop/train/" - - # Define the folder names in order - file_names = ['arxiv', 'c4', 'cc', 'github', 'se', 'wiki'] - folder_names = [os.path.join(base_path, folder) for folder in file_names] - - # Check if the provided sample_counts_list contains exactly two lists - if len(sample_counts_list) != 2 or any(len(sample_counts) != 6 for sample_counts in sample_counts_list): - raise ValueError("sample_counts_list must contain exactly two lists, each with 6 numbers.") - - # Create the output folder if it doesn't exist - if not os.path.exists(output_folder): - os.makedirs(output_folder) - - # List to store the manifest data - manifest_data = [] - - # Loop over the two lists of sample counts - for index, sample_counts in enumerate(sample_counts_list): - # Iterate through each folder and sample the required number of .tar files - for i, folder in enumerate(folder_names): - folder_path = os.path.join(folder) - - if not os.path.exists(folder_path): - raise ValueError(f"Folder {folder_path} does not exist.") - - # Get all .tar files from the current folder - all_files = [f for f in os.listdir(folder_path) if f.endswith('.tar')] - - # Ensure the sample count is not more than available files - sample_count = min(sample_counts[i], len(all_files)) - - # Randomly sample the required number of files from the folder - sampled_files = random.sample(all_files, sample_count) - - # Copy each sampled file to the output folder with the new name - for file_name in sampled_files: - # Construct source file path - source_file_path = os.path.join(folder_path, file_name) - - # Create the new filename by prepending the index (0 or 1) with a dash - new_file_name = f"{index}-{file_name[:-4]}" # Remove the .tar extension - - # Destination path in the output folder - dest_file_path = os.path.join(output_folder, new_file_name + '.tar') # Keep .tar in destination - - # Copy the file to the output folder - shutil.copy2(source_file_path, dest_file_path) - - # Add entry to manifest_data, replacing ".tar" in new_file_name with an empty string - manifest_entry = { - "shard": new_file_name, # No .tar extension - "num_sequences": 489 # Set a fixed number of sequences - } - manifest_data.append(manifest_entry) - - # Write the manifest.jsonl file - manifest_file_path = os.path.join(output_folder, "manifest.jsonl") - with open(manifest_file_path, 'w') as manifest_file: - # Write each entry except the last one with a newline - for entry in manifest_data: - manifest_file.write(json.dumps(entry) + '\n') - - print(f"Files sampled and saved in {output_folder}. 
Manifest file created as {manifest_file_path}.") \ No newline at end of file diff --git a/open_lm/hf/__init__.py b/open_lm/hf/__init__.py new file mode 100644 index 0000000..8493168 --- /dev/null +++ b/open_lm/hf/__init__.py @@ -0,0 +1,3 @@ +from .configuration_openlm import OpenLMConfig +from .modeling_openlm import OpenLMForCausalLM +from .tokenization_openlm import OpenLMTokenizerFast diff --git a/open_lm/hf/__pycache__/__init__.cpython-310.pyc b/open_lm/hf/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72a80db20c9a931ace21be1fd1eb1076b71a4f27 GIT binary patch literal 319 zcmd1j<>g`kg1x`bq)i6Wk3k${zy#zt0CBMfkVs)jVa#F3WsG8E1hJWNm~xq;n89r3 z9F|}!P-iT5=%1k^WyV?M&;xdfm{^DpPQeOnv@{qp>x?BasN1kSd%&UCnQ;dBbR3Qs{KhRMXvzQVU-Y07EKw0{M$WTVNL`KTL@!5{@?x!+VI}aY5 zNIsFp$2q>puj4fyXfLln8H3%H{m?f!dlMF1PJ_jy}E47grelVLBo|Jvq_sg$t>@4?{pvb}A5*|nv%2KBJnIEiIX%Y=Zrv`{U_ccYBiMIagj nZ0{1Iiqf`_!|84rr0SBxLH8Sc4;Cc^-aMGnsh4_le?IvKM$p?? literal 0 HcmV?d00001 diff --git a/open_lm/hf/__pycache__/modeling_openlm.cpython-310.pyc b/open_lm/hf/__pycache__/modeling_openlm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84b00fbcefa4508866c19f33a5e5d6f73de281e6 GIT binary patch literal 5904 zcma)ATXP&o6`tvtoxQK5wQNhiPMi>X9Y;=82uYbFKsh!^Vao<91cu9KwB0L>JUg@L znYAp{E~H{p;U)ZnRAEy*;1~V?#XsPI=YHkMk3hmDH}jpIU1?>>z;4Z%)2B~QpFVwV z-ATP(G4T82XMgH^{kUQLg&LDT8;x(`N&gJM4Q@t;M{gFHUFI>9a4WL9wr6(>UZLxF zPPgb4HQkI#UJ3nnRPI*1il!B!YPaUqH10(8Zo_M6yckV&r@d*7m!g?&(`$AQcn5U5 z9L;tQdIvRLi4JuSdxwEnc`cgj&U^EwA&h%R#L@evcZ}D?e24J{pSo*%$G>gxX+Cq; z;4`9f$MQ~y>3b)|yf|^6#0l%2;ff%ISn6;%UoP_LVTrROwnT3zImARAH&# zi-an?9it7-sq0cK$smjcznX9nfmFB{wATcXLl=X78bnvFUh8MQe)dk7tz8e&3_Y{o z>Su}OznsW1SBoj3rA_m5P03jrNZ`{~&Tl00X4(td z;{0aPPgBu3pFp{O)ZKosb7U({XbZK|_vDS}M+lOOgwGU@s~DYD#BM)MuQ zOeq1oP%~?0(`4^8ScTP0dTb`)fkqq8uHVs7E9KZ%VR$AtJjPAW61G?1%yak|ZVQ_; zQ52kvnQ_OsXAaE!rU_R}=iyFp z&w=4v9B|$8ZS8E9ra9cW0WS1vti=|U)$a9MMO8@qm|c~8Ka9i7_q91mG9Rz1DO}tN z+d|erktYFKma2E$y6(5iA|doYwO4!O)ZS) zSLHB96!b%$s=7u~UvyUlnhiLxh$#;E-5|X=*urFhuhwV%?^kBc2q#;eOoV>G5 z3mg9^k~nxS;z>US7%*=1OzI0k#)&%0Zo%kwh!MfDGLD z1?yPz#=XLQV>k_tGSVQ?;9;6JlR4|fdnNQX2h;b;yaMxqpWkQTo}u=Ab2tO6Hke+o zYaD!onLBJal{E%agF57D@Y;tqyr90g_42M=bHmh4Mdq&bGdJILmnX3QILTa?UN4cE z8)ojN$hKbI-V~qqlScku1DCd5*t4rNn5MvY$I}SzhVe*-trz#(^#3sk$>CY-t2{&C zEP-7((&217 zrm0#xi2YR&3yCN(#GsUR-Zr;R?45LybyqAeVhDwb@6h;P0*pe%D;Z%X(hXb_#TJvZ zJMEzzmq#sdtTW;nA}r+Cz`kkA(*t{0h>K$yXLAS*1EhX(_W^=L?({=s4deprp2aP0 z-$$7F$Qn8W2kz@EYID{$toMJZW#v|~ZSOGcN|=T*;$++w$_nEQNn|ZRHD>jaUaL4^ zT6)kt7O1K^Hb){}&DOkGWW(f^pi-w3-u83qxX8+*$75Eaj)S*{;OYbO|!w8cEfVahBeEW#b6=M7j|Lm{N4@vuit#4t)wMPgb-=0<23NVTztTC zZ*64B%{*0ccylT*f$i~S%u+$FmoUNk5lK&GR5G`o-7DpOLsUVhxHgR%{|Ki?Cm_m9 z*kwrRWB+d$auEc14FKY5-~S#F9<%OQMCv`_{Thr~yxP9v?KkAfx|_qta^1-i-Z(Cx zE}=J)hnDd{ArD`*4H2%cWq!5^WoiY>>lmw?JofCX-Gij6QDE0KgWibWko@FsB?OZa z+d93!iua6rvT|K~Y_y1qRx(4;MFkcqocYM|V}-vBPI-gCW12olYTGmA@4z_{6{Y^Z zqV7BHiTailR9h_g1csptOgdFvOnM=bA)PkdYgeu&?inu9whZ;*)kVdWY_t~Kx7^%I zku@_nh`F1s36Ro$l%?)^KSfFxZ5*xK)j+NUtKyuSCg4Hdg{$l29Sgw)&kI|ccZoX3 zM>)#12pxG57DT9xBhkS`sT&}1KcDn=#=WC0!PS;P!`2{8gav@NlSGgTg@+*kg5Quj6Y@oZ2jYlf;6LbpK}*+;K`RpCbT@t#mjD2WNXR9 zh3$|QlZ8TWLaqTQm%HYI91JFz=8W+I7Wy|A3y2rO7<4MIxaI( zDl;GUm7;`SbL>~DAc=JVA zTdlo|c*3Y$l%w;x2Ko}%ybq>WSGH0DNe zq@5x^3VJi-KBEhcJkv{BetwcXMcAt}sB$HVSC^p4M5^lLC<(Gr!^z*06sYW#B#9(t zUP+-`76?!&E2(~zmkC@Y@EU+}u4-}RcZsq@;0FYLNWdfTCV(nmqH`tX2>DZL*GGrg zwuqwfK~}5&85*LC)f2#|Hco#AP(s~j+qQX(vBtRkoOoOI-)zRdELrS}lFdFZ71(Dr zhkaTqvVS`z_AjT*{^?ZMKb$K2gw-(a5&O6@RrzblpNNu 
zA331vhc&;HBumRxML6Ut)}X2gBZ0p86RExkQx5LPXmzB)T=!aST7g>kwTU>CN-9ge zxk=JOe(puIiBhe6@A$569bz%VD3hgnN7U|5Xr6Q-z*VNKUh+F(_B6;i>{UniGM$x2JOWt&6YKzDESpUyGdz%{JtgT~LN=l%ngJu1uq literal 0 HcmV?d00001 diff --git a/open_lm/hf/__pycache__/tokenization_openlm.cpython-310.pyc b/open_lm/hf/__pycache__/tokenization_openlm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd3cc05fe64b54d1ea671e7d0dac39bc7420b4be GIT binary patch literal 525 zcmY*Wu};G<5Vf6zhANdRKw@Oh()>VG6(PX_1&K~sEUUh>p>bU7M5wH=@*%A3`~hF! zm5E<~1v{xqImu_=o$uYfJK5`Xk(KwKkL-;g^y!RU@vSkoM~|%Hh~okYoMJ&zf}KqY zPxz^i(FNij_n#5>fjoI>0QhbXd+A3>Sp8wFFv z@)G#gdCqhdV-sDkpuD^~lVz6AXD)qLur} zfKtJ^(ovX-^hkR{SKMGLXL7g}c zlDc>y2CKCSEKcY;baM=mPa2OUc9dCybdv>&&I+dWB<`4~QmoXOR05P%b|px8R;4GK wqJu>?m^I_5)owNADm3UOR4~tV1?rDR`*N!`;jm>ra?{hUxju Union[Tuple, CausalLMOutputWithPast]:
+        if inputs_embeds is not None:
+            log.warning("inputs_embeds is set but OpenLM does not support it yet")
+        if attention_bias is not None:
+            log.warning("attention_bias is set but OpenLM does not support it yet")
+        if use_cache is None:
+            use_cache = True
+        if output_attentions:
+            raise ValueError("output_attentions is not yet supported in OpenLM")
+        if output_hidden_states:
+            raise ValueError("output_hidden_states is not yet supported in OpenLM")
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model.forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+        )
+
+        logits = outputs[0]
+        past_key_values = outputs[2]
+        hidden_states = None
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = torch.nn.CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.model_config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=past_key_values,
+            hidden_states=hidden_states,
+        )
+
+    def can_generate(self) -> bool:
+        return True
+
+    def prepare_inputs_for_generation(
+        self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values[0][1], int):
+                # This assumes that the second item of past key values is the length of the past (this is the case for linear attention)
+                past_length = past_key_values[0][1]
+            else:
+                # This assumes that the first item of past key values is a list of all the past keys, thus the
+                # shape 1 is the length of the past (this is the case for attention without window)
+                past_length = past_key_values[0][0].shape[1]
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+
+        model_inputs = {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.pop("use_cache", True),
+        }
+        return model_inputs
+
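+    # Usage sketch for the registered classes (illustrative; assumes a local
+    # checkpoint directory saved with save_pretrained and these auto-class mappings):
+    #
+    #   from transformers import AutoModelForCausalLM, AutoTokenizer
+    #   model = AutoModelForCausalLM.from_pretrained(ckpt_dir)
+    #   tok = AutoTokenizer.from_pretrained(ckpt_dir)
+    #   ids = tok("The capital of France is", return_tensors="pt").input_ids
+    #   print(tok.decode(model.generate(ids, max_new_tokens=8)[0]))
+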
+    def get_input_embeddings(self) -> torch.nn.Module:
+        return self.model.tok_embeddings
+
+    def set_input_embeddings(self, value: torch.nn.Module):
+        self.model.tok_embeddings = value
+
+    def get_output_embeddings(self):
+        if self.model_config.weight_tying:
+            return self.model.tok_embeddings
+        else:
+            return self.model.output
+
+    def set_output_embeddings(self, value: torch.nn.Module):
+        if self.model_config.weight_tying:
+            self.model.tok_embeddings = value
+        else:
+            self.model.output = value
+
+    def tie_weights(self):
+        """
+        Adapted from OLMo (see description below); removing this no-op degraded generation to garbage, so it must stay.
+        This function is intentionally left as a no-op.
+        Weight tying is handled as follows:
+        - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
+        See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
+        - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
+        See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
+        Therefore, there is no need to explicitly tie the weights in this function.
+        """
+        pass
+
+    def resize_token_embeddings(
+        self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
+    ) -> torch.nn.Embedding:
+        raise NotImplementedError
+
+
+# Register the model so that it is available for transformer pipelines, auto-loading, etc.
+AutoModelForCausalLM.register(OpenLMConfig, OpenLMForCausalLM)
diff --git a/open_lm/hf/tokenization_openlm.py b/open_lm/hf/tokenization_openlm.py
new file mode 100644
index 0000000..e8abdd6
--- /dev/null
+++ b/open_lm/hf/tokenization_openlm.py
@@ -0,0 +1,18 @@
+# Follows OLMo's HF template
+
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
+
+from open_lm.hf.configuration_openlm import OpenLMConfig
+
+
+class OpenLMTokenizerFast(PreTrainedTokenizerFast):
+    # Note: OpenLM's tokenizer is already a wrapper around huggingface. This is potentially unnecessary.
+    pass
+
+    # def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+    #     # This is required to make the implementation complete.
+    #     pass
+
+
+# Register the tokenizer class so that it is available for transformer pipelines, auto-loading, etc.
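+# Loading sketch (illustrative; assumes a checkpoint directory that contains
+# tokenizer files saved via save_pretrained):
+#   from transformers import AutoTokenizer
+#   tok = AutoTokenizer.from_pretrained(ckpt_dir)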
+AutoTokenizer.register(OpenLMConfig, fast_tokenizer_class=OpenLMTokenizerFast) diff --git a/open_lm/infer_proportions.py b/open_lm/infer_proportions.py deleted file mode 100644 index 1dadd64..0000000 --- a/open_lm/infer_proportions.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch -import numpy as np - -from extra_funcs import train_classifier, proj_simplex, round_preserving_sum, sample_and_rename_files, inference, del_dir - -def comparison(x, xcandidate): - - list1 = round_preserving_sum(x.tolist()) - list2 = round_preserving_sum(xcandidate.tolist()) - list = [list1, list2] - - sample_and_rename_files(list) - - train_classifier() - - result = inference() - - del_dir("/workspace/youssef/lrz/logs/RedPajama/prop") - del_dir("/workspace/youssef/lrz/datasets/prop/train") - - return result - - -def gradientless_descent(N=6, num_iter=200, radius = 0.2, alpha=0.5): - - #For measuring error - xorig = np.array([0.0325,0.1575,0.6775,0.0525,0.0275,0.0525]) - - # initialize x with equal probability - x = np.ones(N)/N - - error = [] - prop = [] - - for i in range(num_iter): - - stepsize = 1/(i+1)**alpha - # choose random direction with radius R - dir = np.random.randn(N) - dir = dir/np.linalg.norm(dir)*radius*stepsize - - xcandidate = proj_simplex( x + dir ) - - # compare x with x+dir and update x - if comparison(x, xcandidate) == 1: - x = xcandidate - - print(i, np.linalg.norm(x-xorig), x) - error.append(np.linalg.norm(x-xorig)) - prop.append(x) - - torch.save(error, "error.pt") - torch.save(prop, "prop.pt") - return x - -if __name__ == "__main__": - gradientless_descent() diff --git a/open_lm/main2.py b/open_lm/main2.py deleted file mode 100644 index 55863f8..0000000 --- a/open_lm/main2.py +++ /dev/null @@ -1,1034 +0,0 @@ -import atexit -import logging -import os -import re -import sys -import random -from datetime import datetime -import functools -import numpy as np -from pathlib import Path -import json -import traceback - -import fsspec -import torch -from torch import optim -from torch.cuda.amp import GradScaler - -import torch.distributed as dist - -from open_lm.data import sample_chunk - -from torch.distributed.fsdp import ( - FullyShardedDataParallel as FSDP, - MixedPrecision, - BackwardPrefetch, - ShardingStrategy, - FullStateDictConfig, - StateDictType, - CPUOffload, -) -from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy - -from open_lm.data import proc_token -from open_lm.model import Block -from open_lm.losses import CrossEntropyLossWithZLoss -from open_lm.utils.averaging_utils import ModelAverager - -try: - import wandb -except ImportError: - wandb = None - -try: - import torch.utils.tensorboard as tensorboard -except ImportError: - tensorboard = None - -from open_lm.model import create_model -from open_lm.model import create_classif_model - -from open_lm.utils.transformers.hf_wrapper import create_wrapped_hf_model -from open_lm.data import get_data, get_wds_dataset -from open_lm.distributed import is_master, init_distributed_device, broadcast_object -from open_lm.logger import setup_logging -from open_lm.params import parse_args -from open_lm.scheduler import cosine_lr, const_lr -from open_lm.train import train_one_epoch -from open_lm.evaluate import evaluate_loop -from open_lm.file_utils import ( - pt_load, - check_exists, - start_sync_process, - remote_sync_with_expon_backoff, - get_metadata_file, - get_string_for_epoch, - log_num_checkpoints, - terminate_sync_process, -) - - -LATEST_CHECKPOINT_NAME = "epoch_latest.pt" - - -def random_seed(seed=42, rank=0): - 
torch.manual_seed(seed + rank) - np.random.seed(seed + rank) - random.seed(seed + rank) - - -def natural_key(string_): - """See http://www.codinghorror.com/blog/archives/001018.html""" - return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())] - - -def get_latest_checkpoint(path: str): - is_s3 = path.startswith("s3") - fs, root_path = fsspec.core.url_to_fs(path) - checkpoints = fs.glob(os.path.join(root_path, "epoch_*.pt")) - if checkpoints: - checkpoints = sorted(checkpoints, key=natural_key) - return f"s3://{checkpoints[-1]}" if is_s3 else checkpoints[-1] - - return None - - -def get_state_dict(name): - checkpoint = pt_load(name, map_location="cpu") - if "epoch" in checkpoint: - sd = checkpoint["state_dict"] - if next(iter(sd.items()))[0].startswith("module"): - sd = {k[len("module.") :]: v for k, v in sd.items()} - else: - sd = checkpoint - return sd - - -def load_model(args, model, different_seed=False): - checkpoint = pt_load(args.resume, map_location="cpu") - if "epoch" in checkpoint: - if not different_seed and "shard_shuffle_seed" in checkpoint: - pretrained_seed = checkpoint["shard_shuffle_seed"] - assert ( - pretrained_seed == args.seed - ), f"This checkpoint was trained with a random seed of {pretrained_seed}. Since this seed affects shard shuffling, resuming training must use the same seed." - else: - if different_seed: - message = "Resuming a checkpoint without checking that the seed match. This means that training might not be reproducible." - else: - message = "Resuming a checkpoint that does not have a seed saved. This means that the shards were not shuffled, so they will remain unshuffled." - logging.info(message) - pretrained_seed = None - - # resuming a train checkpoint w/ epoch and optimizer state - start_epoch = checkpoint["epoch"] - sd = checkpoint["state_dict"] - global_step = checkpoint.get("step", None) - if next(iter(sd.items()))[0].startswith("module"): - sd = {k[len("module.") :]: v for k, v in sd.items()} - if "_orig_mod" in next(iter(sd.items()))[0]: - sd = {k.replace("_orig_mod.", ""): v for k, v in sd.items()} - if args.fsdp: - model.load_state_dict(sd) - elif args.distributed: - model.module.load_state_dict(sd) - else: - model.load_state_dict(sd) - logging.info(f"=> resuming checkpoint '{args.resume}' (epoch {start_epoch})") - else: - # loading a bare (model only) checkpoint for fine-tune or evaluation - start_epoch, global_step = 0, 0 - pretrained_seed = None - model.load_state_dict(checkpoint) - logging.info(f"=> loaded checkpoint '{args.resume}' (epoch {start_epoch})") - return start_epoch, global_step, pretrained_seed - - -def load_avg_models(args, averagers): - checkpoint = pt_load(args.resume, map_location="cpu") - if "epoch" in checkpoint: - # resuming a train checkpoint w/ epoch and optimizer state - start_epoch = checkpoint["epoch"] - if averagers is not None: - for k in averagers.avgs_dict: - avg_sd = torch.load(args.resume.replace("epoch", k), map_location="cpu") - if next(iter(avg_sd.items()))[0].startswith("module"): - avg_sd = {k[len("module.") :]: v for k, v in avg_sd.items()} - if "_orig_mod" in next(iter(avg_sd.items()))[0]: - avg_sd = {k.replace("_orig_mod.", ""): v for k, v in avg_sd.items()} - averagers.avgs_dict[k].load_state_dict_avg(avg_sd) - logging.info( - f"=> resuming averager for {k} from checkpoint '{args.resume.replace('epoch', k)} (epoch {start_epoch})" - ) - return - - -def load_optimizer(args, model, optimizer, scaler): - potential_checkpoint = args.resume.replace("epoch_", "optimizer_") - if 
check_exists(potential_checkpoint): - checkpoint = pt_load(potential_checkpoint, map_location="cpu") - else: - checkpoint = pt_load(args.resume, map_location="cpu") - if "optimizer" in checkpoint: - if optimizer is not None: - osd = checkpoint["optimizer"] - if args.fsdp: - osd = FSDP.optim_state_dict_to_load(model=model, optim=optimizer, optim_state_dict=osd) - optimizer.load_state_dict(osd) - logging.info(f"=> resuming optimizer") - if scaler is not None and "scaler" in checkpoint: - scaler.load_state_dict(checkpoint["scaler"]) - else: - logging.info(f"=> WARNING: not resuming optimizer.") - - -def load_data_chunks(args): - checkpoint = pt_load(args.resume, map_location="cpu") - if "next_shard_per_source" in checkpoint and "samples_seen" in checkpoint: - return checkpoint["next_shard_per_source"], checkpoint["samples_seen"] - else: - logging.info( - "=> WARNING: tried to resume a checkpoint without data loading info. Re-starting data loading from the " - "first shard." - ) - return [0 for _ in range(len(args.dataset_manifest))], 0 - - -def save_checkpoint( - args, - model, - optimizer, - scaler, - completed_epoch, - evaluation_metrics, - step, - is_final_checkpoint, - percentage_of_data_seen=-1.0, - next_shard_per_source=None, - samples_seen=None, - shard_shuffle_seed=None, - train_data_string=None, - averagers=None, - failed=False, -): - cpu_state, optim_state = None, None - if args.logs and args.logs.lower() != "none" and args.fsdp: - save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) - with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy): - cpu_state = model.state_dict() - optim_state = FSDP.optim_state_dict(model, optimizer) - if args.save_logs: - checkpoint_dict_model = { - "epoch": completed_epoch, - "name": args.name, - "state_dict": cpu_state if args.fsdp else model.state_dict(), - "evaluation_metrics": evaluation_metrics, - } - if next_shard_per_source is not None: - checkpoint_dict_model["next_shard_per_source"] = next_shard_per_source - - if samples_seen is not None: - checkpoint_dict_model["samples_seen"] = samples_seen - - if step is not None: - checkpoint_dict_model["step"] = step - - if shard_shuffle_seed is not None: - checkpoint_dict_model["shard_shuffle_seed"] = shard_shuffle_seed - - checkpoint_dict_opt = { - "epoch": completed_epoch, - "name": args.name, - "optimizer": optim_state if args.fsdp else optimizer.state_dict(), - "evaluation_metrics": evaluation_metrics, - } - - if scaler is not None: - checkpoint_dict_opt["scaler"] = scaler.state_dict() - - checkpoint_dict_stats = { - "epoch": completed_epoch, - "name": args.name, - "is_final_checkpoint": is_final_checkpoint, - "evaluation_metrics": evaluation_metrics, - "percentage_of_data_seen": percentage_of_data_seen, - } - if next_shard_per_source is not None: - checkpoint_dict_stats["next_shard_per_source"] = next_shard_per_source - - if samples_seen is not None: - checkpoint_dict_stats["samples_seen"] = samples_seen - - if step is not None: - checkpoint_dict_stats["step"] = step - - if shard_shuffle_seed is not None: - checkpoint_dict_stats["shard_shuffle_seed"] = shard_shuffle_seed - - if train_data_string is not None: - checkpoint_dict_stats["train_data_string"] = train_data_string - - prefixes = { - "epoch_": checkpoint_dict_model, - "optimizer_": checkpoint_dict_opt, - "stats_": checkpoint_dict_stats, - } - - if averagers is not None: - for k in averagers.avgs_dict: - prefixes[f"{k}_"] = averagers.avgs_dict[k].get_state_dict_avg() - if ( - completed_epoch == 
args.epochs - or is_final_checkpoint - or (args.save_frequency > 0 and (completed_epoch % args.save_frequency) == 0) - ): - for prefix in prefixes: - save_path = args.checkpoint_path if not failed else args.failed_checkpoint_path - path = os.path.join(save_path, f"{prefix}{completed_epoch}.pt") - print(f"Saving {prefix}{completed_epoch} in {path}...") - torch.save( - prefixes[prefix], - path, - ) - - if args.delete_previous_checkpoint: - for prefix in prefixes: - prev = os.path.join(args.checkpoint_path, f"{prefix}{completed_epoch - 1}.pt") - if os.path.exists(prev): - os.remove(prev) - - -def cleanup(sync_process, distributed=False): - if sync_process: - terminate_sync_process(sync_process) - if distributed and torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - -def main(args): - args = parse_args(args) - - requires_training = args.train_data or args.dataset_type == "synthetic" or args.dataset_manifest is not None - - if torch.cuda.is_available(): - # This enables tf32 on Ampere GPUs which is only 8% slower than - # float16 and almost as accurate as float32 - # This was a default in pytorch until 1.12 - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.benchmark = True - torch.backends.cudnn.deterministic = False - - # fully initialize distributed device environment - device = init_distributed_device(args) - - assert ( - args.global_batch_size % args.world_size == 0 - ), f"Global batch size ({args.global_batch_size}) is not divisible by number of GPUs ({args.world_size}), and thus cannot be respected." - - args.per_gpu_batch_size = max(args.global_batch_size // args.world_size, 1) - if args.val_data is not None: - args.per_gpu_val_batch_size = max(args.global_val_batch_size // args.world_size, 1) - - if args.hf_model is not None and args.hf_seq_len is None: - raise ValueError("If passing --hf-model, must also pass --hf-seq-len to be used for training/fine-tuning.") - - if args.hf_model is not None and args.fsdp and args.hf_fsdp_block is None: - raise ValueError("If passing --hf-model and --fsdp, must also pass --hf-fspd-block.") - - if args.fsdp and not args.distributed: - raise ValueError(f"--fsdp can only be specified in distributed mode.") - - # get the name of the experiments - if args.name is None: - # sanitize model name for filesystem / uri use, easier if we don't use / in name as a rule? - model_name_safe = None - if args.hf_model is not None: - model_name_safe = args.hf_model.replace("/", "-") - else: - if Path(args.model).is_file(): - model_name_safe = Path(args.model).stem.replace("/", "-") - else: - model_name_safe = args.model.replace("/", "-") - - date_str = datetime.now().strftime("%Y_%m_%d-%H_%M_%S") - if args.distributed: - # sync date_str from master to all ranks - date_str = broadcast_object(args, date_str) - args.name = "-".join( - [ - date_str, - f"model_{model_name_safe}", - f"lr_{args.lr}", - f"b_{args.per_gpu_batch_size}", # Per gpu to respect old naming convention - ] - ) - - resume_latest = args.resume == "latest" - log_base_path = os.path.join(args.logs, args.name) - args.log_path = None - if is_master(args, local=args.log_local): - os.makedirs(log_base_path, exist_ok=True) - log_filename = f"out-{args.rank}" if args.log_local else "out.log" - args.log_path = os.path.join(log_base_path, log_filename) - if os.path.exists(args.log_path) and not resume_latest: - raise ValueError(f"Experiment {args.log_path} already exists. 
Use --name to specify a new experiment.") - - # Setup text logger - args.log_level = logging.DEBUG if args.debug else logging.INFO - setup_logging(args.log_path, args.log_level) - - # Setup wandb, tensorboard, checkpoint logging - args.wandb = "wandb" in args.report_to or "all" in args.report_to - args.tensorboard = "tensorboard" in args.report_to or "all" in args.report_to - args.checkpoint_path = os.path.join(log_base_path, "checkpoints") - args.failed_checkpoint_path = os.path.join(log_base_path, "checkpoints_failed") - if is_master(args): - args.tensorboard_path = os.path.join(log_base_path, "tensorboard") if args.tensorboard else "" - for dirname in [args.tensorboard_path, args.checkpoint_path, args.failed_checkpoint_path]: - if dirname: - os.makedirs(dirname, exist_ok=True) - else: - args.tensorboard_path = "" - - if resume_latest: - resume_from = None - checkpoint_path = args.checkpoint_path - - # If using remote_sync, need to check the remote instead of the local checkpoints folder. - if args.remote_sync is not None: - checkpoint_path = os.path.join(args.remote_sync, args.name, "checkpoints") - - if is_master(args): - # Checking for existing checkpoint via master rank only. It is possible for - # different rank processes to see different files if a shared file-system is under - # stress, however it's very difficult to fully work around such situations. - if args.save_most_recent: - # if --save-most-recent flag is set, look for latest at a fixed filename - resume_from = os.path.join(checkpoint_path, "checkpoints", LATEST_CHECKPOINT_NAME) - if not os.path.exists(resume_from): - # If no latest checkpoint has been saved yet, don't try to resume - resume_from = None - else: - # otherwise, list checkpoint dir contents and pick the newest checkpoint - resume_from = get_latest_checkpoint(checkpoint_path) - if resume_from: - logging.info(f"Found latest resume checkpoint at {resume_from}.") - else: - logging.info(f"No latest resume checkpoint found in {checkpoint_path}.") - if args.distributed: - # sync found checkpoint path to all ranks - resume_from = broadcast_object(args, resume_from) - args.resume = resume_from - - if args.copy_codebase: - copy_codebase(args) - - # start the sync proces if remote-sync is not None - remote_sync_process = None - if is_master(args) and args.remote_sync is not None: - # first make sure it works - result = remote_sync_with_expon_backoff( - args.remote_sync_frequency, - os.path.join(args.logs, args.name), - os.path.join(args.remote_sync, args.name), - args.remote_sync_protocol, - ) - if result: - logging.info("remote sync successful.") - else: - raise ValueError("Remote sync failed.") - # if all looks good, start a process to do this every args.remote_sync_frequency seconds - remote_sync_process = start_sync_process( - args.remote_sync_frequency, - os.path.join(args.logs, args.name), - os.path.join(args.remote_sync, args.name), - args.remote_sync_protocol, - ) - remote_sync_process.start() - - # Handle cleanup even if open_lm crashes. - # TODO: For cases where main() is called as a functio, we need to call cleanup() manually. - # Right now, we do this manually in every case where main returns, but we should put main() in a wrapper and call - # cleanup() outside it, ideally. - atexit.register(cleanup, sync_process=remote_sync_process, distributed=args.distributed) - - if args.precision == "fp16": - logging.warning( - "It is recommended to use AMP mixed-precision instead of FP16. " - "FP16 support needs further verification and tuning, especially for train." 
- ) - - elif args.distributed: - logging.info( - f"Running in distributed mode with multiple processes. Device: {args.device}." - f"Process (global: {args.rank}, local {args.local_rank}), total {args.world_size}." - ) - else: - logging.info(f"Running with a single process. Device {args.device}.") - - random_seed(args.seed, 0) - - model = None - if args.hf_model is not None: - model = create_wrapped_hf_model(args) - else: - # Optional: Use meta device - with torch.device("meta" if args.experimental_meta_device and args.fsdp else args.device): - if args.classification: - model = create_classif_model(args) - else: - model = create_model(args) - - args.vocab_size = model.vocab_size - args.seq_len = model.seq_len - if args.train_num_samples is not None: - args.train_num_samples //= args.seq_len - if args.val_num_samples is not None: - if args.val_num_samples // args.seq_len == 0: - raise ValueError( - f"number of requested evaluation val_num_samples (tokens): {args.val_num_samples} is less than seq_len: {args.seq_len}" - ) - args.val_num_samples //= args.seq_len - - averagers = None - random_seed(args.seed, args.rank) - - if args.grad_checkpointing: - model.set_grad_checkpointing() - - if args.distributed: - if args.fsdp: - transformer_layer_cls = None - - if args.hf_model is not None: - # retrive the user specified block class for fsdp - for _, target_cls in model.named_modules(): - if args.hf_fsdp_block in type(target_cls).__name__: - transformer_layer_cls = {type(target_cls)} - break - - if transformer_layer_cls is None: - print(f"--hf-fsdp-block {args.hf_fsdp_block} not found in --hf-model {args.hf_model}") - return -1 - - else: - transformer_layer_cls = {Block} - # from https://pytorch.org/blog/efficient-large-scale-training-with-pytorch/ - transformer_auto_wrapper_policy = functools.partial( - transformer_auto_wrap_policy, - transformer_layer_cls=transformer_layer_cls, - ) - # tries to follow gopher... - mp_policy = None - if args.fsdp_amp: - print("=> using bfloat16 params as part of fsdp amp policy.") - mp_policy = MixedPrecision( - param_dtype=torch.bfloat16, - reduce_dtype=torch.float32, - buffer_dtype=torch.bfloat16, - ) - elif args.fsdp_pure_bf16: - print("=> using pure bfloat16 params as part of fsdp amp policy.") - mp_policy = MixedPrecision( - param_dtype=torch.bfloat16, - reduce_dtype=torch.bfloat16, - buffer_dtype=torch.bfloat16, - ) - - if args.rank == 0: - print(f"Before FSDP parameter num: {sum(p.numel() for p in model.parameters()):,}") - print(f"Before FSDP {torch.cuda.memory_allocated()/1024**3:.3} GB") - - fsdp_kwargs = {} - assert not ( - args.fsdp_hybrid and args.fsdp_hybrid_o2 - ), "Only --fsdp-hybrid or --fsdp-hybrid-o2 should be set." - if args.fsdp_backward_prefetch: - fsdp_kwargs["backward_prefetch"] = BackwardPrefetch.BACKWARD_PRE - if args.fsdp_hybrid: - fsdp_kwargs["sharding_strategy"] = ShardingStrategy.HYBRID_SHARD - if args.fsdp_hybrid_o2: - fsdp_kwargs["sharding_strategy"] = ShardingStrategy._HYBRID_SHARD_ZERO2 - print("=> FSDP kwargs: ", fsdp_kwargs) - - # Initialize FSDP. Use the same seed across workers to ensure reset_parameters is the same across workers. 
- random_seed(args.seed, rank=0) - model = FSDP( - model, - auto_wrap_policy=transformer_auto_wrapper_policy, - device_id=device, - mixed_precision=mp_policy, - cpu_offload=CPUOffload(offload_params=args.fsdp_cpu_offload), - use_orig_params=args.fsdp_use_orig_params, - limit_all_gathers=args.fsdp_limit_all_gathers, - **fsdp_kwargs, - ) - - print(f"After FSDP parameter num: {sum(p.numel() for p in model.parameters()):,} on rank {args.rank}") - print(f"After FSDP {torch.cuda.memory_allocated()/1024**3:.3} GB on rank {args.rank}") - else: - ddp_args = {} - if args.ddp_static_graph: - # this doesn't exist in older PyTorch, arg only added if enabled - ddp_args["static_graph"] = True - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], **ddp_args) - if args.averagers is not None: - averagers = ModelAverager(model, args.averagers) - if args.resume is not None and averagers is not None: - load_avg_models(args, averagers) - - if is_master(args): - logging.info(f"Model (has {sum(p.numel() for p in model.parameters() if p.requires_grad)} parameters):") - logging.info(f"{str(model)}") - logging.info("Params:") - params_file = os.path.join(args.logs, args.name, "params.txt") - with open(params_file, "w") as f: - for name in sorted(vars(args)): - val = getattr(args, name) - logging.info(f" {name}: {val}") - f.write(f"{name}: {val}\n") - - # optionally resume model from a checkpoint - start_epoch, global_step = 0, 0 - shard_shuffle_seed = args.seed - if args.resume is not None: - start_epoch, global_step, shard_shuffle_seed = load_model(args, model) - - elif args.pretrained is not None: - print("=> loading from a pre-trained model.") - args.resume = args.pretrained - # this flag continues training from the pre-trained model. - if args.load_pretrained_state: - start_epoch, global_step, shard_shuffle_seed = load_model(args, model) - else: - load_model(args, model, different_seed=True) - args.resume = None - elif args.average is not None: - num_models_to_average = len(args.average) - print( - "=> Averaging models: ", - args.average, - " with coefficients: ", - args.average_coefficients, - ) - assert num_models_to_average > 1, "num_models_to_average must be > 1 - else use --pretrained" - if args.average_coefficients is None: - args.average_coefficients = [1.0 / num_models_to_average] * num_models_to_average - else: - assert len(args.average_coefficients) == num_models_to_average - state_dict = {k: v * args.average_coefficients[0] for k, v in get_state_dict(args.average[0]).items()} - for i in range(1, num_models_to_average): - state_dict_i = get_state_dict(args.average[i]) - for k in state_dict: - state_dict[k] = state_dict[k] + state_dict_i[k] * args.average_coefficients[i] - model.load_state_dict(state_dict) - - # Put the shard shuffle seed back into args (this is done for compatibility with older, non shuffling versions) - args.shard_shuffle_seed = shard_shuffle_seed - - if requires_training and global_step is None: - raise ValueError("Key 'step' not found in checkpoint, but required for training.") - - # Add data chunk when resuming (only for dataset without resampling) - next_shard_per_source = [0 for _ in range(len(args.dataset_manifest))] if args.dataset_manifest is not None else 0 - samples_seen = 0 - if args.resume is not None and args.dataset_manifest is not None: - next_shard_per_source, samples_seen = load_data_chunks(args) - if samples_seen >= args.train_num_samples * args.epochs: - raise RuntimeError("Loaded a checkpoint which has already seen the desired number of 
tokens.") - - # create optimizer and scaler - optimizer = None - scaler = None - - if requires_training: - named_parameters = list(model.named_parameters()) - no_decay_params = [] # to be potentially used later - params = [p for n, p in named_parameters if p.requires_grad] - - optimizer = optim.AdamW( - [ - {"params": no_decay_params, "weight_decay": 0.0}, - {"params": params, "weight_decay": args.wd}, - ], - lr=args.lr, - betas=(args.beta1, args.beta2), - eps=args.eps, - ) - scaler = None - if args.precision == "amp": - assert not args.fsdp, "FSDP not supported with amp, only amp_bfloat16" - scaler = GradScaler() - - # initialize datasets - # use tokenizer=None because the data is already pre-tokenized. - - data = get_data( - args, - epoch=start_epoch, - tokenizer=None, - skip_train=args.dataset_manifest is not None, - floor=args.dataset_manifest is not None, - ) - - if args.target_mask_left is not None: - # tokens handled with same modulo in dataloading - args.target_mask_left = proc_token(args.target_mask_left, args.vocab_size) - - if args.target_mask_individual is not None: - # tokens handled with same modulo in dataloading - args.target_mask_individual = proc_token(args.target_mask_individual, args.vocab_size) - - if args.torchcompile: - logging.info("Compiling model...") - model = torch.compile(model) - if averagers is not None: - logging.info("Compiling averagers...") - for k in averagers.avgs_dict: - averagers.avgs_dict[k].av_model = torch.compile(averagers.avgs_dict[k].av_model) - - # optionally resume optimizer from a checkpoint - # this needs to be after torchcompile - if args.resume is not None: - load_optimizer(args, model, optimizer, scaler) - - # create scheduler if train - scheduler = None - if requires_training: - if args.dataset_manifest is not None: - total_steps = (args.train_num_samples * args.epochs) // args.global_batch_size - else: - total_steps = (data["train"].dataloader.num_batches) * args.epochs - - if args.lr_scheduler == "cosine": - scheduler = cosine_lr( - optimizer, - args.lr, - args.warmup, - total_steps, - args.lr_cooldown_end, - args.force_min_lr, - ) - elif args.lr_scheduler == "const": - scheduler = const_lr( - optimizer, - args.lr, - args.warmup, - # total_steps, - # args.lr_cooldown_end, - # args.force_min_lr, - ) - else: - raise ValueError(f"Unknown scheduler, {args.lr_scheduler}. Available options are: cosine, const.") - - # determine if this worker should save logs and checkpoints. only do so if it is rank == 0 - args.save_logs = args.logs and args.logs.lower() != "none" and is_master(args) - writer = None - if args.save_logs and args.tensorboard: - assert tensorboard is not None, "Please install tensorboard." - writer = tensorboard.SummaryWriter(args.tensorboard_path) - if args.wandb and is_master(args): - assert wandb is not None, "Please install wandb." 
- logging.debug("Starting wandb.") - - wandb.init( - project=args.wandb_project_name, - name=args.name, - notes=args.wandb_notes, - tags=[], - resume=None, - config=vars(args), - ) - if args.debug: - wandb.watch(model, log="all") - wandb.save(params_file) - logging.debug("Finished loading wandb.") - - if not requires_training: - if not args.resume: - logging.info("No training required, exiting.") - cleanup(remote_sync_process, args.distributed) - return - logging.info("No training required, evaluating instead.") - checkpoint_root = os.path.dirname(args.resume) - - if averagers is not None: - k = next(iter(averagers.avgs_dict.keys())) - logging.info(f"=> evaluation avg {k}") - model = averagers.avgs_dict[k].av_model - metrics = evaluate_loop(model, data["val_list"], start_epoch, args, writer) - metrics["average"] = k if averagers is not None else "none" - - if is_master(args): - with fsspec.open(os.path.join(checkpoint_root, "results.jsonl"), "a") as f: - f.write(f"{json.dumps(metrics)}\n") - - cleanup(remote_sync_process, args.distributed) - return - - loss = torch.nn.CrossEntropyLoss() - if args.z_loss_coefficient != 0.0: - if is_master(args): - logging.info("Using CrossEntropyLossWithZLoss.") - loss = CrossEntropyLossWithZLoss(args.z_loss_coefficient) - - if args.dataset_manifest: - log_num_checkpoints(total_steps, args) - - # Only enter training loop if there are steps to be done. - done_training = global_step >= total_steps - epoch = start_epoch - num_ckpt_too_few_tokens = 0 - while not done_training: - if is_master(args): - logging.info(f"Start epoch {epoch}") - - if args.dataset_manifest is not None: - assert not args.dataset_resampled, "dataset_manifest and dataset_resampled are mutually exclusive" - ( - train_data_string_per_source, - num_samples_per_source, - next_shard_per_source, - ) = get_string_for_epoch( - args.train_num_samples, - next_shard_per_source, - args.dataset_manifest, - args.train_data_mix_weights, - args.workers, - args.world_size, - multi_epoch=args.multiple_data_passes, - shard_shuffle_seed=args.shard_shuffle_seed, - ) - - # In the distributed case, make sure that all nodes receive the same string - if args.distributed: - all_source_strings = ["" for _ in range(args.world_size)] - dist.all_gather_object(all_source_strings, train_data_string_per_source) - assert all( - [x == train_data_string_per_source for x in all_source_strings] - ), "Dataset to train on is not the same across all nodes. This should not happen normally, unless there is an issue with shard shuffling during the dataset generation." - - if data["train"] is not None: - del data["train"] - args.train_data = train_data_string_per_source - - # Draw num_samples_per_source at most from dataset - rounded down to guarantee uniqueness. 
- data["train"] = get_wds_dataset( - args, True, epoch, force_num_samples=num_samples_per_source, data_key=args.data_key, floor=True - ) - - prev_step = global_step - if is_master(args): - logging.info(f"=> epoch {epoch}, training on {args.train_data}") - - if args.distributed: - dist.barrier() - - - #for batch in data["train"].dataloader: - # (texts, labels) = batch - # print(labels) - - # Get the dataloader and create an iterator - #dataloader = data["train"].dataloader - #data_iterator = iter(dataloader) - #batch = next(data_iterator) - - #(texts, labels) = batch - - #texts = torch.LongTensor(texts).to('cuda:0') - #labels = torch.LongTensor(labels).to('cuda:0') - #print(labels, labels.size()) - #labels = labels.unsqueeze(1).repeat(1, args.seq_len) - #print(labels, labels.size()) - - - #print(len(texts), texts.dtype, texts[0].size()) - #print(len(labels), labels.dtype, labels.size()) - #print(labels) - - - #print(len(labels), labels.dtype) - - #print("len(texts)= ", len(texts), " size(texts[0])= ", texts[0].size()) - #print(type(texts), type(texts[0])) - - #inputs, targets = sample_chunk(texts, args) - - #print("len(inputs)= ", len(inputs), " size(inputs[0])= ", inputs[0].size()) - #print(type(inputs), type(inputs[0])) - - #print("len(targets)= ", len(targets), " size(targets)= ", targets[0].size()) - #print(type(targets), type(targets[0])) - - #print("texts[0]= ", texts[0]) - #print("inputs[0]= ", inputs[0]) - #print("targets[0]= ", targets[0]) - - #out, _, _ = model(inputs) - - #print("len(out)= ", len(out), " size(out)= ", out.size()) - #print(type(out), type(out[0])) - #print("out[0]= ", out[0]) - - #device = next(model.parameters()).device - #print(inputs.device, device) - - #print("reshape") - #print("out reshaped: ", out.reshape(-1, args.vocab_size).size(), "targets reshaped: ", targets.reshape(-1).size()) - #print(targets.dtype) - #print(targets) - - #out = out[:, -1, :] - #print("out reshaped: ", out.reshape(-1, args.num_classes).size(), "lables reshaped: ", labels.reshape(-1).size()) - - success, global_step = train_one_epoch( - model, - data, - loss, - averagers=averagers, - epoch=epoch, - step=global_step, - optimizer=optimizer, - scaler=scaler, - scheduler=scheduler, - total_steps=total_steps, - args=args, - tb_writer=writer, - ) - - if args.distributed: - dist.barrier() - - done_training = global_step >= total_steps - steps_done_epoch = global_step - prev_step - samples_seen = samples_seen + steps_done_epoch * args.global_batch_size - - if not success: - logging.info("Training exiting due to NaN value") - break - - failed_ckpt = False - expected_steps = data["train"].dataloader.num_batches - if steps_done_epoch < (1 - args.data_tolerate_error_p) * expected_steps and not done_training: - failed_ckpt = True - num_ckpt_too_few_tokens += 1 - if is_master(args): - logging.warning( - f"Epoch {epoch}, tokens seen: {steps_done_epoch * args.global_batch_size * args.seq_len}, tokens expected: {expected_steps * args.global_batch_size * args.seq_len}, ratio: {steps_done_epoch / expected_steps}" - ) - - epoch = epoch + 1 - evaluation_metrics = [] - if "val_list" in data and (epoch % args.val_frequency == 0 or done_training): - # validate based on frequency and always validate the last checkpoint - try: - evaluation_metrics = evaluate_loop(model, data["val_list"], epoch, args, writer) - - if is_master(args): - with fsspec.open(os.path.join(args.checkpoint_path, "results.jsonl"), "a") as f: - f.write(f"{json.dumps(evaluation_metrics)}\n") - - except Exception as e: - if is_master(args): - 
-                    logging.error(e)
-                    logging.error(traceback.format_exc())
-                    logging.warning("evaluation failed! continuing to save_checkpoint")
-
-        if is_master(args):
-            end_of_epoch_log = {
-                "epoch": epoch,
-                "tokens": (global_step + 1) * args.global_batch_size * args.seq_len,
-                "checkpoints_too_few_tokens": num_ckpt_too_few_tokens,
-                "percentage_of_data_seen": steps_done_epoch / expected_steps,
-            }
-
-            if args.dataset_manifest is not None:
-                for i in range(len(next_shard_per_source)):
-                    end_of_epoch_log[f"next_shard_{i}"] = next_shard_per_source[i]
-                    end_of_epoch_log[f"dataset_pass_{i}"] = next_shard_per_source[i] // len(
-                        get_metadata_file(args.dataset_manifest[i])
-                    )
-
-            for name, val in end_of_epoch_log.items():
-                name = "train/" + name
-                if writer is not None:
-                    writer.add_scalar(name, val, global_step)
-                if args.wandb:
-                    assert wandb is not None, "Please install wandb."
-                    wandb.log({name: val, "step": global_step, "tokens": end_of_epoch_log["tokens"]})
-
-        # Saving checkpoints.
-        save_checkpoint(
-            args,
-            model,
-            optimizer,
-            scaler,
-            epoch,
-            evaluation_metrics,
-            step=global_step,
-            is_final_checkpoint=done_training,
-            percentage_of_data_seen=1.0 * steps_done_epoch / expected_steps,
-            next_shard_per_source=next_shard_per_source if args.dataset_manifest is not None else None,
-            samples_seen=samples_seen if args.dataset_manifest is not None else None,
-            shard_shuffle_seed=args.shard_shuffle_seed,
-            train_data_string=train_data_string_per_source if args.dataset_manifest is not None else None,
-            averagers=averagers,
-            failed=failed_ckpt,
-        )
-
-        if num_ckpt_too_few_tokens > args.data_tolerate_num_ckpts:
-            raise RuntimeError(
-                f"{num_ckpt_too_few_tokens} checkpoints happened where the number of tokens seen was {1 - args.data_tolerate_error_p} of expected. This is likely due to transient errors e.g. reading from S3."
-            )
-
-        if done_training:
-            if is_master(args):
-                logging.info("Model has seen the desired number of tokens. Ending training.")
-            break
-
-    if args.wandb and is_master(args):
-        wandb.finish()
-
-    # run a final sync.
-    if remote_sync_process is not None:
-        logging.info("Final remote sync.")
-        terminate_sync_process(remote_sync_process)
-        result = remote_sync_with_expon_backoff(
-            args.remote_sync_frequency,
-            os.path.join(args.logs, args.name),
-            os.path.join(args.remote_sync, args.name),
-            args.remote_sync_protocol,
-        )
-        if result:
-            logging.info("Final remote sync successful.")
-        else:
-            logging.info("Final remote sync failed.")
-
-    # Final sync of all procs.
-    if args.distributed:
-        dist.barrier()
-
-    cleanup(remote_sync_process, args.distributed)
-    return args
-
-
-def copy_codebase(args):
-    from shutil import copytree, ignore_patterns
-
-    new_code_path = os.path.join(args.logs, args.name, "code")
-    if os.path.exists(new_code_path):
-        print(f"Error. Experiment already exists at {new_code_path}. Use --name to specify a new experiment.")
-        return -1
-    print(f"Copying codebase to {new_code_path}")
-    current_code_path = os.path.realpath(__file__)
-    for _ in range(3):
-        current_code_path = os.path.dirname(current_code_path)
-    copytree(current_code_path, new_code_path, ignore=ignore_patterns("log", "logs", "wandb"))
-    print("Done copying code.")
-    return 1
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
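For readers tracing the deleted loop above: the step and token bookkeeping reduces to a few integer identities. A minimal Python sketch with hypothetical numbers (none of these values come from this patch), following the same formulas used for total_steps, samples_seen, and the logged token count:

# Hypothetical settings, for illustration only.
seq_len = 2048                      # model.seq_len
global_batch_size = 256             # args.global_batch_size
epochs = 2                          # args.epochs
train_num_tokens = 1_000_000_000    # --train-num-samples is given in tokens

# main2.py first converts the token budget into sequences ...
train_num_samples = train_num_tokens // seq_len
# ... then derives the step budget shared by the scheduler and the while-loop.
total_steps = (train_num_samples * epochs) // global_batch_size

# Per-epoch progress accounting, as in the loop above.
steps_done_epoch = 1200  # what train_one_epoch would report
samples_seen = steps_done_epoch * global_batch_size
tokens_seen = steps_done_epoch * global_batch_size * seq_len
print(total_steps, samples_seen, tokens_seen)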
diff --git a/open_lm/manifest.jsonl b/open_lm/manifest.jsonl
deleted file mode 100644
index 1e4cc33..0000000
--- a/open_lm/manifest.jsonl
+++ /dev/null
@@ -1,200 +0,0 @@
-{"shard": "shard-0000000", "num_sequences": 8192}
-{"shard": "shard-0000001", "num_sequences": 8192}
-{"shard": "shard-0000002", "num_sequences": 8192}
-{"shard": "shard-0000003", "num_sequences": 8192}
-{"shard": "shard-0000004", "num_sequences": 8192}
-{"shard": "shard-0000005", "num_sequences": 8192}
-{"shard": "shard-0000006", "num_sequences": 8192}
-{"shard": "shard-0000007", "num_sequences": 8192}
-{"shard": "shard-0000008", "num_sequences": 8192}
-{"shard": "shard-0000009", "num_sequences": 8192}
-{"shard": "shard-0000010", "num_sequences": 8192}
-{"shard": "shard-0000011", "num_sequences": 8192}
-{"shard": "shard-0000012", "num_sequences": 8192}
-{"shard": "shard-0000013", "num_sequences": 8192}
-{"shard": "shard-0000014", "num_sequences": 8192}
-{"shard": "shard-0000015", "num_sequences": 8192}
-{"shard": "shard-0000016", "num_sequences": 8192}
-{"shard": "shard-0000017", "num_sequences": 8192}
-{"shard": "shard-0000018", "num_sequences": 8192}
-{"shard": "shard-0000019", "num_sequences": 8192}
-{"shard": "shard-0000020", "num_sequences": 8192}
-{"shard": "shard-0000021", "num_sequences": 8192}
-{"shard": "shard-0000022", "num_sequences": 8192}
-{"shard": "shard-0000023", "num_sequences": 8192}
-{"shard": "shard-0000024", "num_sequences": 8192}
-{"shard": "shard-0000025", "num_sequences": 8192}
-{"shard": "shard-0000026", "num_sequences": 8192}
-{"shard": "shard-0000027", "num_sequences": 8192}
-{"shard": "shard-0000028", "num_sequences": 8192}
-{"shard": "shard-0000029", "num_sequences": 8192}
-{"shard": "shard-0000030", "num_sequences": 8192}
-{"shard": "shard-0000031", "num_sequences": 8192}
-{"shard": "shard-0000032", "num_sequences": 8192}
-{"shard": "shard-0000033", "num_sequences": 8192}
-{"shard": "shard-0000034", "num_sequences": 8192}
-{"shard": "shard-0000035", "num_sequences": 8192}
-{"shard": "shard-0000036", "num_sequences": 8192}
-{"shard": "shard-0000037", "num_sequences": 8192}
-{"shard": "shard-0000038", "num_sequences": 8192}
-{"shard": "shard-0000039", "num_sequences": 8192}
-{"shard": "shard-0000040", "num_sequences": 8192}
-{"shard": "shard-0000041", "num_sequences": 8192}
-{"shard": "shard-0000042", "num_sequences": 8192}
-{"shard": "shard-0000043", "num_sequences": 8192}
-{"shard": "shard-0000044", "num_sequences": 8192}
-{"shard": "shard-0000045", "num_sequences": 8192}
-{"shard": "shard-0000046", "num_sequences": 8192}
-{"shard": "shard-0000047", "num_sequences": 8192}
-{"shard": "shard-0000048", "num_sequences": 8192}
-{"shard": "shard-0000049", "num_sequences": 8192}
-{"shard": "shard-0000050", "num_sequences": 8192}
-{"shard": "shard-0000051", "num_sequences": 8192}
-{"shard": "shard-0000052", "num_sequences": 8192}
-{"shard": "shard-0000053", "num_sequences": 8192}
-{"shard": "shard-0000054", "num_sequences": 8192}
-{"shard": "shard-0000055", "num_sequences": 8192}
-{"shard": "shard-0000056", "num_sequences": 8192}
-{"shard": "shard-0000057", "num_sequences": 8192}
"shard-0000058", "num_sequences": 8192} -{"shard": "shard-0000059", "num_sequences": 8192} -{"shard": "shard-0000060", "num_sequences": 8192} -{"shard": "shard-0000061", "num_sequences": 8192} -{"shard": "shard-0000062", "num_sequences": 8192} -{"shard": "shard-0000063", "num_sequences": 8192} -{"shard": "shard-0000064", "num_sequences": 8192} -{"shard": "shard-0000065", "num_sequences": 8192} -{"shard": "shard-0000066", "num_sequences": 8192} -{"shard": "shard-0000067", "num_sequences": 8192} -{"shard": "shard-0000068", "num_sequences": 8192} -{"shard": "shard-0000069", "num_sequences": 8192} -{"shard": "shard-0000070", "num_sequences": 8192} -{"shard": "shard-0000071", "num_sequences": 8192} -{"shard": "shard-0000072", "num_sequences": 8192} -{"shard": "shard-0000073", "num_sequences": 8192} -{"shard": "shard-0000074", "num_sequences": 8192} -{"shard": "shard-0000075", "num_sequences": 8192} -{"shard": "shard-0000076", "num_sequences": 8192} -{"shard": "shard-0000077", "num_sequences": 8192} -{"shard": "shard-0000078", "num_sequences": 8192} -{"shard": "shard-0000079", "num_sequences": 8192} -{"shard": "shard-0000080", "num_sequences": 8192} -{"shard": "shard-0000081", "num_sequences": 8192} -{"shard": "shard-0000082", "num_sequences": 8192} -{"shard": "shard-0000083", "num_sequences": 8192} -{"shard": "shard-0000084", "num_sequences": 8192} -{"shard": "shard-0000085", "num_sequences": 8192} -{"shard": "shard-0000086", "num_sequences": 8192} -{"shard": "shard-0000087", "num_sequences": 8192} -{"shard": "shard-0000088", "num_sequences": 8192} -{"shard": "shard-0000089", "num_sequences": 8192} -{"shard": "shard-0000090", "num_sequences": 8192} -{"shard": "shard-0000091", "num_sequences": 8192} -{"shard": "shard-0000092", "num_sequences": 8192} -{"shard": "shard-0000093", "num_sequences": 8192} -{"shard": "shard-0000094", "num_sequences": 8192} -{"shard": "shard-0000095", "num_sequences": 8192} -{"shard": "shard-0000096", "num_sequences": 8192} -{"shard": "shard-0000097", "num_sequences": 8192} -{"shard": "shard-0000098", "num_sequences": 8192} -{"shard": "shard-0000099", "num_sequences": 8192} -{"shard": "shard-0000100", "num_sequences": 8192} -{"shard": "shard-0000101", "num_sequences": 8192} -{"shard": "shard-0000102", "num_sequences": 8192} -{"shard": "shard-0000103", "num_sequences": 8192} -{"shard": "shard-0000104", "num_sequences": 8192} -{"shard": "shard-0000105", "num_sequences": 8192} -{"shard": "shard-0000106", "num_sequences": 8192} -{"shard": "shard-0000107", "num_sequences": 8192} -{"shard": "shard-0000108", "num_sequences": 8192} -{"shard": "shard-0000109", "num_sequences": 8192} -{"shard": "shard-0000110", "num_sequences": 8192} -{"shard": "shard-0000111", "num_sequences": 8192} -{"shard": "shard-0000112", "num_sequences": 8192} -{"shard": "shard-0000113", "num_sequences": 8192} -{"shard": "shard-0000114", "num_sequences": 8192} -{"shard": "shard-0000115", "num_sequences": 8192} -{"shard": "shard-0000116", "num_sequences": 8192} -{"shard": "shard-0000117", "num_sequences": 8192} -{"shard": "shard-0000118", "num_sequences": 8192} -{"shard": "shard-0000119", "num_sequences": 8192} -{"shard": "shard-0000120", "num_sequences": 8192} -{"shard": "shard-0000121", "num_sequences": 8192} -{"shard": "shard-0000122", "num_sequences": 8192} -{"shard": "shard-0000123", "num_sequences": 8192} -{"shard": "shard-0000124", "num_sequences": 8192} -{"shard": "shard-0000125", "num_sequences": 8192} -{"shard": "shard-0000126", "num_sequences": 8192} -{"shard": "shard-0000127", "num_sequences": 
8192} -{"shard": "shard-0000128", "num_sequences": 8192} -{"shard": "shard-0000129", "num_sequences": 8192} -{"shard": "shard-0000130", "num_sequences": 8192} -{"shard": "shard-0000131", "num_sequences": 8192} -{"shard": "shard-0000132", "num_sequences": 8192} -{"shard": "shard-0000133", "num_sequences": 8192} -{"shard": "shard-0000134", "num_sequences": 8192} -{"shard": "shard-0000135", "num_sequences": 8192} -{"shard": "shard-0000136", "num_sequences": 8192} -{"shard": "shard-0000137", "num_sequences": 8192} -{"shard": "shard-0000138", "num_sequences": 8192} -{"shard": "shard-0000139", "num_sequences": 8192} -{"shard": "shard-0000140", "num_sequences": 8192} -{"shard": "shard-0000141", "num_sequences": 8192} -{"shard": "shard-0000142", "num_sequences": 8192} -{"shard": "shard-0000143", "num_sequences": 8192} -{"shard": "shard-0000144", "num_sequences": 8192} -{"shard": "shard-0000145", "num_sequences": 8192} -{"shard": "shard-0000146", "num_sequences": 8192} -{"shard": "shard-0000147", "num_sequences": 8192} -{"shard": "shard-0000148", "num_sequences": 8192} -{"shard": "shard-0000149", "num_sequences": 8192} -{"shard": "shard-0000150", "num_sequences": 8192} -{"shard": "shard-0000151", "num_sequences": 8192} -{"shard": "shard-0000152", "num_sequences": 8192} -{"shard": "shard-0000153", "num_sequences": 8192} -{"shard": "shard-0000154", "num_sequences": 8192} -{"shard": "shard-0000155", "num_sequences": 8192} -{"shard": "shard-0000156", "num_sequences": 8192} -{"shard": "shard-0000157", "num_sequences": 8192} -{"shard": "shard-0000158", "num_sequences": 8192} -{"shard": "shard-0000159", "num_sequences": 8192} -{"shard": "shard-0000160", "num_sequences": 8192} -{"shard": "shard-0000161", "num_sequences": 8192} -{"shard": "shard-0000162", "num_sequences": 8192} -{"shard": "shard-0000163", "num_sequences": 8192} -{"shard": "shard-0000164", "num_sequences": 8192} -{"shard": "shard-0000165", "num_sequences": 8192} -{"shard": "shard-0000166", "num_sequences": 8192} -{"shard": "shard-0000167", "num_sequences": 8192} -{"shard": "shard-0000168", "num_sequences": 8192} -{"shard": "shard-0000169", "num_sequences": 8192} -{"shard": "shard-0000170", "num_sequences": 8192} -{"shard": "shard-0000171", "num_sequences": 8192} -{"shard": "shard-0000172", "num_sequences": 8192} -{"shard": "shard-0000173", "num_sequences": 8192} -{"shard": "shard-0000174", "num_sequences": 8192} -{"shard": "shard-0000175", "num_sequences": 8192} -{"shard": "shard-0000176", "num_sequences": 8192} -{"shard": "shard-0000177", "num_sequences": 8192} -{"shard": "shard-0000178", "num_sequences": 8192} -{"shard": "shard-0000179", "num_sequences": 8192} -{"shard": "shard-0000180", "num_sequences": 8192} -{"shard": "shard-0000181", "num_sequences": 8192} -{"shard": "shard-0000182", "num_sequences": 8192} -{"shard": "shard-0000183", "num_sequences": 8192} -{"shard": "shard-0000184", "num_sequences": 8192} -{"shard": "shard-0000185", "num_sequences": 8192} -{"shard": "shard-0000186", "num_sequences": 8192} -{"shard": "shard-0000187", "num_sequences": 8192} -{"shard": "shard-0000188", "num_sequences": 8192} -{"shard": "shard-0000189", "num_sequences": 8192} -{"shard": "shard-0000190", "num_sequences": 8192} -{"shard": "shard-0000191", "num_sequences": 8192} -{"shard": "shard-0000192", "num_sequences": 8192} -{"shard": "shard-0000193", "num_sequences": 8192} -{"shard": "shard-0000194", "num_sequences": 8192} -{"shard": "shard-0000195", "num_sequences": 8192} -{"shard": "shard-0000196", "num_sequences": 8192} -{"shard": "shard-0000197", 
"num_sequences": 8192} -{"shard": "shard-0000198", "num_sequences": 8192} -{"shard": "shard-0000199", "num_sequences": 8192} \ No newline at end of file diff --git a/open_lm/model.py b/open_lm/model.py index 3c00cc4..ba2dd1b 100644 --- a/open_lm/model.py +++ b/open_lm/model.py @@ -509,22 +509,23 @@ def create_model(args): def create_classif_model(args): model = Transformer(create_params(args)) - checkpoint = pt_load(args.classif_model_path, map_location="cpu") - model.load_state_dict(checkpoint["state_dict"]) + if args.classif_model_path is not None: + checkpoint = pt_load(args.classif_model_path, map_location="cpu") + model.load_state_dict(checkpoint["state_dict"]) dim = model.output.in_features model.output = nn.Linear(dim, args.num_classes, bias = False) - + return model -def test_classif_model(args, model_path): +def test_classif_model(args): model = Transformer(create_params(args)) dim = model.output.in_features model.output = nn.Linear(dim, args.num_classes, bias = False) - checkpoint = pt_load(model_path, map_location="cpu") + checkpoint = pt_load(args.classif_model_path, map_location="cpu") model.load_state_dict(checkpoint["state_dict"]) return model diff --git a/open_lm/params.py b/open_lm/params.py index f74fa89..1b5f79f 100644 --- a/open_lm/params.py +++ b/open_lm/params.py @@ -804,7 +804,8 @@ def parse_args(args): type=str, default=None, help="Path of the pretrained model to be finetuned for classification.", - ) + ) + add_model_args(parser) config = maybe_load_config(parser, args) diff --git a/open_lm/positional_embedding/__pycache__/__init__.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/__init__.cpython-310.pyc index 3a8f889c0fd95673df95bb5d4b6d5e8f6ef26714..926d8c8d8f24f9375bbc2b2ac6aad53f3868f05a 100644 GIT binary patch delta 19 ZcmZ3$xPXy6pO=@50SNZ~J~NSfDgZ751(pB+ delta 19 ZcmZ3$xPXy6pO=@50SG3QHcaH63IHiG1epK; diff --git a/open_lm/positional_embedding/__pycache__/head_rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/head_rotary.cpython-310.pyc index 9904228a597d336ae6ffd9285468f8c4e42e62a6..9a508c1979ce1ae5897e26c0944f1e70e1e68b01 100644 GIT binary patch delta 20 acmaFJ_mGb}pO=@50SNZ~KC_WKj2!?!8wMHx delta 20 acmaFJ_mGb}pO=@50SG3SHf-b$V+Q~^RRtaZ diff --git a/open_lm/positional_embedding/__pycache__/llama_rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/llama_rotary.cpython-310.pyc index 47159ae62c9e0a09fc02a8203f66a58a7ddf8142..8d8be0b0ed132a7bae7ea1ec7e860aa1bc364ead 100644 GIT binary patch delta 20 acmZ3ayGWNipO=@50SNZ~KC_W~x)=aDt_CUq delta 20 acmZ3ayGWNipO=@50SG3SHf-dcE(QQJ=mjnS diff --git a/open_lm/positional_embedding/__pycache__/none.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/none.cpython-310.pyc index 296bc4ea4ac192eb82999a89aaa36cd065ea7174..5dbdb82aee29f4e6ec1c7a8ea95405ab8f67b130 100644 GIT binary patch delta 20 acmcb_bcu;OpO=@50SNZ~KC_YAoe=;#4F%5t delta 20 acmcb_bcu;OpO=@50SG3QHf-c}X9NH>Km^hN diff --git a/open_lm/positional_embedding/__pycache__/rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/rotary.cpython-310.pyc index d4bd893eeca9abaed40259b35b90aa022ad498de..6c775c685b78ccd63a968a1748e7ddc8851524c5 100644 GIT binary patch delta 184 zcmew>^H+vDpO=@50SNZ~KC_X#nTs)Mb3d0DBbyOWsK|KoYVK`}rjwI+ep#4;1Pnlg z4v^4fEm8)tAp?NW=m}Sb_*^5CJyV scJe)52?;wOKLUu0S%3rwgA8L4NTkSYG9RBTqxEDRJ}E}K$?kk+02EFln*aa+ delta 184 zcmew>^H+vDpO=@50SG3SHf-c>=3>0Hxt~jnkwK%orgdkQ@SnIe74t1uxPFV<*!|24}iE)jcd; zy?F4Vvp$PW#A6aa z+MY;wr4>8jSB9(+LhYsZf!wfLo#%^m?hwxEiVHJlpgBeXR5A!J~`&3@V+V15Mw6{!d77~gU5nZOdp?Tqbs 
diff --git a/open_lm/positional_embedding/__pycache__/__init__.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/__init__.cpython-310.pyc
index 3a8f889c0fd95673df95bb5d4b6d5e8f6ef26714..926d8c8d8f24f9375bbc2b2ac6aad53f3868f05a 100644
GIT binary patch
diff --git a/open_lm/positional_embedding/__pycache__/head_rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/head_rotary.cpython-310.pyc
index 9904228a597d336ae6ffd9285468f8c4e42e62a6..9a508c1979ce1ae5897e26c0944f1e70e1e68b01 100644
GIT binary patch
diff --git a/open_lm/positional_embedding/__pycache__/llama_rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/llama_rotary.cpython-310.pyc
index 47159ae62c9e0a09fc02a8203f66a58a7ddf8142..8d8be0b0ed132a7bae7ea1ec7e860aa1bc364ead 100644
GIT binary patch
diff --git a/open_lm/positional_embedding/__pycache__/none.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/none.cpython-310.pyc
index 296bc4ea4ac192eb82999a89aaa36cd065ea7174..5dbdb82aee29f4e6ec1c7a8ea95405ab8f67b130 100644
GIT binary patch
diff --git a/open_lm/positional_embedding/__pycache__/rotary.cpython-310.pyc b/open_lm/positional_embedding/__pycache__/rotary.cpython-310.pyc
index d4bd893eeca9abaed40259b35b90aa022ad498de..6c775c685b78ccd63a968a1748e7ddc8851524c5 100644
GIT binary patch
diff --git a/open_lm/utils/transformers/__pycache__/hf_config.cpython-310.pyc b/open_lm/utils/transformers/__pycache__/hf_config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6a0f302ae27e78753248a6980c896ea333914d1
GIT binary patch
diff --git a/requirements.txt b/requirements.txt
>=0.0.22
 tiktoken
 wandb
diff --git a/requirements_test.txt b/requirements_test.txt
index 61f15ce..8413123 100644
--- a/requirements_test.txt
+++ b/requirements_test.txt
@@ -3,4 +3,4 @@ pytest-cov==3.0.0
 pytest-xdist==2.5.0
 pytest==7.0.1
 tensorboard==2.14.1
-llm-foundry>=0.4.0
+llm-foundry==0.9.0
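One more sketch tying the two new entry points together: test_classif_model deliberately swaps the head in before load_state_dict, because a classification checkpoint already contains the (dim, num_classes) head rather than the LM head. The round trip below reuses the toy stand-in from the earlier sketch (paths and sizes are hypothetical, not from this patch):

import torch
import torch.nn as nn

class ToyLM(nn.Module):
    def __init__(self, dim=64, vocab_size=100):
        super().__init__()
        self.body = nn.Linear(dim, dim)
        self.output = nn.Linear(dim, vocab_size, bias=False)

# "Fine-tuned" classifier checkpoint, as create_classif_model would produce.
trained = ToyLM()
trained.output = nn.Linear(64, 2, bias=False)
torch.save({"state_dict": trained.state_dict()}, "/tmp/classif.pt")

# Reload for evaluation, as test_classif_model does: head first, weights second.
fresh = ToyLM()
fresh.output = nn.Linear(64, 2, bias=False)
ckpt = torch.load("/tmp/classif.pt", map_location="cpu")
fresh.load_state_dict(ckpt["state_dict"])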